jrtechs
/
jrtechs-RandomScripts
mirror of https://github.com/jrtechs/RandomScripts.git



								import re as clear


								class process():
								    """Namespace for text clean-up helpers used on scraped text."""

								    # Matches the shortest run from "<" to the next ">" on the same line.
								    # NOTE: "." does not match a newline here, so a tag that spans
								    # several lines is not removed by this pattern.
								    _TAG_PATTERN = clear.compile(r'<.*?>')

								    # Backslashes and quote characters are deleted outright (not replaced
								    # by a space) so that e.g. "don't" collapses to "dont".
								    _DELETE_TABLE = str.maketrans("", "", "\\'\"")

								    # Every remaining punctuation character, plus tab and newline, is
								    # replaced by a single space so words stay separated.
								    _UNWANTED_CHARS = '!"\'#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
								    _SPACE_TABLE = str.maketrans(dict.fromkeys(_UNWANTED_CHARS, " "))

								    @staticmethod
								    def master_clean_text(text):
								        """Return *text* stripped of markup, lower-cased and de-punctuated.

								        Steps, in order:

								        1. Remove every ``<...>`` tag.
								        2. Delete backslashes, single quotes and double quotes.
								        3. Strip leading/trailing whitespace and convert to lowercase.
								        4. Replace each remaining punctuation character, tab and newline
								           with one space.

								        Because step 4 runs after the strip, the result may still begin or
								        end with spaces and may contain runs of spaces; call ``.split()``
								        on the result to obtain the individual words.

								        Declared as a static method so it can be called both on the class
								        and on an instance.

								        Args:
								            text: The string to clean.

								        Returns:
								            The cleaned string.

								        Example:
								            >>> process.master_clean_text("<a>Don't STOP, believing!</a>")
								            'dont stop  believing '
								        """
								        without_tags = process._TAG_PATTERN.sub('', text)
								        without_quotes = without_tags.translate(process._DELETE_TABLE)
								        normalised = without_quotes.strip().lower()
								        return normalised.translate(process._SPACE_TABLE)

								    #master_clean_text("<a> Say youre scrapping a text from you'r website !! WEll it might be swap CASE or unevened you wanna remove all the punctation's into separate WOrd !!!!</a>").split()