From f6d0a2b97c56994a6f537a0001c28dc2dbd9270a Mon Sep 17 00:00:00 2001
From: THIYAGARAJAN
Date: Mon, 21 Oct 2019 19:44:11 +0530
Subject: [PATCH 1/2] text_preprocessing.py

Text preprocessing -- this file does the text manipulation work and uses
the count-vector method to extract the words and indicate the number of
times each one occurs.
---
 ML Cookbook/text_preprocessing | 50 ++++++++++++++++++++++++++++++++++
 1 file changed, 50 insertions(+)
 create mode 100644 ML Cookbook/text_preprocessing

diff --git a/ML Cookbook/text_preprocessing b/ML Cookbook/text_preprocessing
new file mode 100644
index 0000000..8c10309
--- /dev/null
+++ b/ML Cookbook/text_preprocessing
@@ -0,0 +1,50 @@
+"""Count vocabulary words in a sample sentence with CountVectorizer.
+
+A vocabulary is fitted on the ``training`` sentences and the occurrences of
+each vocabulary word are then counted in the ``generalization`` sentence.
+"""
+
+import shutil
+
+import pandas as pd
+from sklearn.feature_extraction.text import CountVectorizer
+
+# Copy the helper module into ../working before importing it below
+# (assumes ../working is on the import path -- TODO confirm).
+shutil.copyfile(
+    src="../input/cleantext/cleantext.py",
+    dst="../working/cleantext.py",
+)
+
+# Import only the name that is used instead of a wildcard import.
+from cleantext import process  # noqa: E402
+
+#!pylint cleantext
+
+training = [
+    " I am master of all",
+    "I am a absolute learner",
+]
+
+generalization = [
+    "I am absolute learner learner",
+]
+
+vectorization = CountVectorizer(
+    stop_words="english",
+    preprocessor=process.master_clean_text,
+)
+
+vectorization.fit(training)
+
+# vocabulary_ maps each word to its column index; sorting the words by that
+# index lines the labels up with the columns of the count matrix.
+vocab = sorted(vectorization.vocabulary_, key=vectorization.vocabulary_.get)
+
+# NOTE: a bare expression is only rendered as the last statement of a
+# notebook cell; run as a plain script, this frame is built and discarded.
+pd.DataFrame(
+    data=vectorization.transform(generalization).toarray(),
+    index=["generalization"],
+    columns=vocab,
+)

From 6e04e517a7d3f3bed80e2f1d571fe9a0a7311a2c Mon Sep 17 00:00:00 2001
From: THIYAGARAJAN
Date: Tue, 22 Oct 2019 18:27:27 +0530
Subject: [PATCH 2/2] Fixed - to display the output

It is now able to extract the words and count them.
---
 text_preprocessing.py | 42 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)
 create mode 100644 text_preprocessing.py

diff --git a/text_preprocessing.py b/text_preprocessing.py
new file mode 100644
index 0000000..0b117a7
--- /dev/null
+++ b/text_preprocessing.py
@@ -0,0 +1,42 @@
+"""Count vocabulary words in a sample sentence with CountVectorizer.
+
+A vocabulary is fitted on the ``training`` sentences and the occurrences of
+each vocabulary word are then counted in the ``generalization`` sentence;
+the resulting count table is printed.
+"""
+
+import pandas as pd
+from sklearn.feature_extraction.text import CountVectorizer
+
+# Import only the name that is used instead of a wildcard import.
+from clean_text import process
+
+#!pylint cleantext
+
+training = [
+    " I am master of all",
+    "I am a absolute learner",
+]
+
+generalization = [
+    "I am absolute learner learner",
+]
+
+vectorization = CountVectorizer(
+    stop_words="english",
+    preprocessor=process.master_clean_text,
+)
+
+vectorization.fit(training)
+
+# vocabulary_ maps each word to its column index; sorting the words by that
+# index lines the labels up with the columns of the count matrix.
+vocab = sorted(vectorization.vocabulary_, key=vectorization.vocabulary_.get)
+
+extracted = pd.DataFrame(
+    data=vectorization.transform(generalization).toarray(),
+    index=["generalization"],
+    columns=vocab,
+)
+
+print(extracted)