import re as clear

# Shortest "<...>" run; "." does not match a newline, so a tag that spans
# several lines is left for the punctuation pass below.
_TAG_PATTERN = clear.compile(r'<.*?>')

# Backslashes and quote characters are deleted outright (not replaced by a
# space) so that contractions such as "don't" collapse to "dont".
_DELETE_TABLE = str.maketrans('', '', '\\\'"')

# Every remaining punctuation character, plus tab and newline, becomes a
# single space so that neighbouring words stay separated.
_UNWANTED = '!"\'#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
_SPACE_TABLE = str.maketrans(dict.fromkeys(_UNWANTED, ' '))


class process():
    """Namespace for text clean-up helpers."""

    @staticmethod
    def master_clean_text(text: str) -> str:
        """Return *text* with tags and punctuation removed, in lowercase.

        The steps run in this order:

        1. Remove every ``<...>`` tag that sits on a single line.
        2. Delete backslashes, single quotes and double quotes.
        3. Strip leading/trailing whitespace and convert to lowercase.
        4. Replace each remaining punctuation character, tab and newline
           with one space.

        Because step 4 runs after the strip, the result can still start or
        end with spaces and can contain runs of several spaces; call
        ``.split()`` on the result to obtain the individual words.

        Declared as a static method so it can be called both on the class
        and on an instance.

        Args:
            text: The raw string to clean.

        Returns:
            The cleaned, lowercased string.

        Example:
            >>> process.master_clean_text("<p>Don't STOP!</p>")
            'dont stop '
        """
        # Clean up all the HTML tags.
        text = _TAG_PATTERN.sub('', text)

        # Remove the unwanted quote and backslash characters.
        text = text.translate(_DELETE_TABLE)

        # Convert to lowercase to reduce the vocabulary downstream.
        text = text.strip().lower()

        # Replace the remaining unwanted characters with spaces.
        return text.translate(_SPACE_TABLE)