|
|
import re as clear
|
|
|
|
class process():
|
|
def master_clean_text(text):
|
|
#clean up all the html tags
|
|
text = clear.sub(r'<.*?>','',text)
|
|
#remove the unwanted punctation chars
|
|
|
|
text = clear.sub(r"\\","",text)
|
|
text = clear.sub(r"\'","",text)
|
|
text = clear.sub(r"\"","",text)
|
|
|
|
# coversion to lowercase to remove complexity
|
|
text = text.strip().lower()
|
|
|
|
#removing unwanted expressions
|
|
|
|
unwanted = '!"\'#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
|
|
|
|
convert = dict((c," ") for c in unwanted)
|
|
|
|
# str.maketrans() --->> creates a one to one mapping of a character to its translation/replacement.
|
|
mapping_trans = str.maketrans(convert)
|
|
|
|
text = text.translate(mapping_trans)
|
|
|
|
return text
|
|
#master_clean_text("<a> Say youre scrapping a text from you'r website !! WEll it might be swap CASE or unevened you wanna remove all the punctation's into separate WOrd !!!!</a>").split()
|