Browse Source

Created clean_text.py

This is an extension file for text preprocessing. It is used to prepare text for word counting by eliminating special characters and converting upper-case letters to lower case.
pull/29/head
THIYAGARAJAN 5 years ago
committed by GitHub
parent
commit
1c18de8300
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
1 changed files with 29 additions and 0 deletions
  1. +29
    -0
      ML Cookbook/clean_text.py

+ 29
- 0
ML Cookbook/clean_text.py View File

@ -0,0 +1,29 @@
import re as clear
class process:
    """Namespace for text-cleaning helpers used before word counting."""

    # Characters that are turned into a single space each by
    # master_clean_text. Built once at class-creation time instead of on
    # every call.
    _UNWANTED = '!"\'#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
    # str.maketrans() creates a one-to-one mapping of each character to its
    # replacement; here every unwanted character maps to a space.
    _UNWANTED_TO_SPACE = str.maketrans(_UNWANTED, " " * len(_UNWANTED))

    @staticmethod
    def master_clean_text(text):
        """Return *text* stripped of HTML tags, lower-cased, and de-punctuated.

        Steps, in order:

        1. Remove anything that looks like an HTML tag (``<...>``, non-greedy,
           within a single line).
        2. Delete backslashes, single quotes and double quotes outright
           (no space is left behind, so ``you're`` becomes ``youre``).
        3. Trim surrounding whitespace and convert to lower case.
        4. Replace every remaining punctuation character, tab and newline
           with a single space.

        Because trimming happens before step 4, the result can still start or
        end with spaces (e.g. a trailing ``!`` becomes a trailing space);
        callers typically follow up with ``.split()``.

        Declared as a static method so it can be called both as
        ``process.master_clean_text(text)`` and on an instance.

        Args:
            text: The string to clean.

        Returns:
            The cleaned string.
        """
        # Clean up all the HTML tags.
        text = clear.sub(r'<.*?>', '', text)
        # Remove backslashes and quote characters without inserting a space.
        text = clear.sub(r"[\\'\"]", "", text)
        # Lower-case to reduce vocabulary complexity.
        text = text.strip().lower()
        # Map the remaining unwanted characters to spaces in a single pass.
        return text.translate(process._UNWANTED_TO_SPACE)
# Example usage (returns the cleaned text as a list of words):
# process.master_clean_text("<a> Say youre scrapping a text from you'r website !! WEll it might be swap CASE or unevened you wanna remove all the punctation's into separate WOrd !!!!</a>").split()

Loading…
Cancel
Save