"""Text-cleaning helper used as an extension of text preprocessing.

The cleaned text has HTML tags, quotes, backslashes, special characters and
upper-case letters eliminated, so it can be split into words and counted.
"""
# Provenance: added as "ML Cookbook/clean_text.py" by THIYAGARAJAN on
# 2019-10-21 (commit 1d09da3b19637726c41fa189a33a4fa160f25771,
# "Created clean_text.py").

import re as clear

# Non-greedy match of anything between '<' and '>'. '.' does not match a
# newline here, so only tags that sit on a single line are removed.
_HTML_TAG = clear.compile(r'<.*?>')

# Backslashes and both kinds of quotes are deleted outright (no space is
# left behind), so "you'r" becomes "your" rather than "you r".
_DELETE_QUOTES = str.maketrans('', '', '\\\'"')

# Every character listed here is replaced by a single space. The quote and
# backslash entries never fire because those characters are deleted earlier;
# they are kept so the list matches the original one exactly.
_UNWANTED_CHARS = '!"\'#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'

# str.maketrans() creates a one-to-one mapping of a character to its
# replacement. Built once at import time instead of on every call.
_TO_SPACE = str.maketrans({char: " " for char in _UNWANTED_CHARS})


class process():
    """Namespace class that groups the text-cleaning routine."""

    @staticmethod
    def master_clean_text(text: str) -> str:
        """Return *text* lower-cased with markup and punctuation removed.

        The steps run in this order:

        1. Remove HTML tags (anything between ``<`` and ``>`` on one line).
        2. Delete backslashes, single quotes and double quotes.
        3. Strip leading/trailing whitespace and convert to lower case.
        4. Replace each remaining punctuation character, tab and newline
           with a single space.

        Because stripping happens before step 4, punctuation at either end
        of the input leaves a space there; call ``.split()`` on the result
        to obtain clean word tokens. Letters and digits are kept.

        Declared as a static method so it can be called both on the class
        and on an instance.

        Args:
            text: The raw string to clean.

        Returns:
            The cleaned, lower-cased string.

        Example:
            >>> process.master_clean_text("<b>Hello</b>, World!")
            'hello  world '
            >>> process.master_clean_text("Say you'r WOrd !!!!").split()
            ['say', 'your', 'word']
        """
        # Clean up all the HTML tags.
        without_tags = _HTML_TAG.sub('', text)

        # Remove the unwanted quote and backslash characters.
        without_quotes = without_tags.translate(_DELETE_QUOTES)

        # Convert to lower case to reduce the number of distinct tokens.
        normalized = without_quotes.strip().lower()

        # Turn the remaining unwanted characters into word separators.
        return normalized.translate(_TO_SPACE)