Repository where I mostly put random Python scripts.
Vous ne pouvez pas sélectionner plus de 25 sujets Les noms de sujets doivent commencer par une lettre ou un nombre, peuvent contenir des tirets ('-') et peuvent comporter jusqu'à 35 caractères.
 
 
 
 

29 lignes
1.0 KiB

import re as clear
class process():
    """Namespace for text-cleaning helpers."""

    # Shortest-match HTML-like tag, e.g. "<a>" or "</a>"; "." does not cross
    # newlines, so a tag split over several lines is not removed here.
    _TAG_RE = clear.compile(r'<.*?>')
    # Backslashes and quote characters are deleted outright (not spaced out).
    _DELETE_TABLE = str.maketrans("", "", "\\'\"")
    # Punctuation, tabs and newlines that are each replaced by one space.
    _UNWANTED = '!"\'#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
    _SPACE_TABLE = str.maketrans(_UNWANTED, " " * len(_UNWANTED))

    @staticmethod
    def master_clean_text(text: str) -> str:
        """Return *text* stripped of tags and punctuation, in lowercase.

        Steps, in order:

        1. Remove every ``<...>`` tag.
        2. Delete backslashes, single quotes and double quotes.
        3. Strip surrounding whitespace and lowercase the text.
        4. Replace each remaining punctuation character, tab and newline
           with a single space.

        Because stripping happens before step 4, the result may still start
        or end with spaces (and contain runs of spaces); call ``.split()``
        on the result to obtain clean tokens.

        Declared as a static method so it can be called both on the class
        and on an instance.
        """
        text = process._TAG_RE.sub('', text)
        text = text.translate(process._DELETE_TABLE)
        text = text.strip().lower()
        return text.translate(process._SPACE_TABLE)
# Example: process.master_clean_text("<a> Say youre scrapping a text from you'r website !! WEll it might be swap CASE or unevened you wanna remove all the punctation's into separate WOrd !!!!</a>").split()