|
@ -0,0 +1,29 @@ |
|
|
|
|
|
|
|
|
|
|
|
import re as clear |
|
|
|
|
|
|
|
|
|
|
|
class process(): |
|
|
|
|
|
def master_clean_text(text): |
|
|
|
|
|
#clean up all the html tags |
|
|
|
|
|
text = clear.sub(r'<.*?>','',text) |
|
|
|
|
|
#remove the unwanted punctation chars |
|
|
|
|
|
|
|
|
|
|
|
text = clear.sub(r"\\","",text) |
|
|
|
|
|
text = clear.sub(r"\'","",text) |
|
|
|
|
|
text = clear.sub(r"\"","",text) |
|
|
|
|
|
|
|
|
|
|
|
# coversion to lowercase to remove complexity |
|
|
|
|
|
text = text.strip().lower() |
|
|
|
|
|
|
|
|
|
|
|
#removing unwanted expressions |
|
|
|
|
|
|
|
|
|
|
|
unwanted = '!"\'#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n' |
|
|
|
|
|
|
|
|
|
|
|
convert = dict((c," ") for c in unwanted) |
|
|
|
|
|
|
|
|
|
|
|
# str.maketrans() --->> creates a one to one mapping of a character to its translation/replacement. |
|
|
|
|
|
mapping_trans = str.maketrans(convert) |
|
|
|
|
|
|
|
|
|
|
|
text = text.translate(mapping_trans) |
|
|
|
|
|
|
|
|
|
|
|
return text |
|
|
|
|
|
#master_clean_text("<a> Say youre scrapping a text from you'r website !! WEll it might be swap CASE or unevened you wanna remove all the punctation's into separate WOrd !!!!</a>").split() |