| @ -0,0 +1,29 @@ | |||||
| import re as clear | |||||
class process():
    """Namespace for text clean-up helpers used before tokenising scraped text."""

    # Characters that are removed outright (nothing is left in their place).
    # This happens BEFORE trimming so that whitespace exposed by removing a
    # leading/trailing quote is still stripped.
    _DELETED = str.maketrans("", "", "\\'\"")

    # Punctuation, tab and newline: each occurrence becomes one space so that
    # words glued together by punctuation ("a-b", "x,y") separate on split().
    _UNWANTED = '!"\'#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
    _TO_SPACE = str.maketrans(_UNWANTED, " " * len(_UNWANTED))

    @staticmethod
    def master_clean_text(text):
        """Return *text* lower-cased with HTML tags and punctuation removed.

        Processing order (the order matters for the exact output):

        1. Drop anything that looks like an HTML tag (``<...>`` on one line).
        2. Delete backslashes, apostrophes and double quotes.
        3. Strip leading/trailing whitespace and convert to lower case.
        4. Replace every remaining punctuation character, tab and newline
           with a single space.

        Because step 4 runs after the strip, the result may contain runs of
        spaces and leading/trailing spaces; callers normally follow up with
        ``.split()`` to obtain the words.

        Declared as a static method so it can be called either as
        ``process.master_clean_text(s)`` or on an instance.

        Args:
            text: The raw string to clean.

        Returns:
            The cleaned, lower-cased string.
        """
        # Non-greedy match so each tag is removed individually rather than
        # everything between the first '<' and the last '>'.
        without_tags = clear.sub(r'<.*?>', '', text)
        without_quotes = without_tags.translate(process._DELETED)
        normalised = without_quotes.strip().lower()
        return normalised.translate(process._TO_SPACE)
| # Example usage: process.master_clean_text("<a> Say youre scrapping a text from you'r website !! WEll it might be swap CASE or unevened you wanna remove all the punctation's into separate WOrd !!!!</a>").split() | |||||