Repository where I mostly put random python scripts.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

29 lines
1.0 KiB

  1. import re as clear
  2. class process():
  3. def master_clean_text(text):
  4. #clean up all the html tags
  5. text = clear.sub(r'<.*?>','',text)
  6. #remove the unwanted punctation chars
  7. text = clear.sub(r"\\","",text)
  8. text = clear.sub(r"\'","",text)
  9. text = clear.sub(r"\"","",text)
  10. # coversion to lowercase to remove complexity
  11. text = text.strip().lower()
  12. #removing unwanted expressions
  13. unwanted = '!"\'#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
  14. convert = dict((c," ") for c in unwanted)
  15. # str.maketrans() --->> creates a one to one mapping of a character to its translation/replacement.
  16. mapping_trans = str.maketrans(convert)
  17. text = text.translate(mapping_trans)
  18. return text
# Example usage:
# process.master_clean_text("<a> Say youre scrapping a text from you'r website !! WEll it might be swap CASE or unevened you wanna remove all the punctation's into separate WOrd !!!!</a>").split()