From 6e04e517a7d3f3bed80e2f1d571fe9a0a7311a2c Mon Sep 17 00:00:00 2001
From: THIYAGARAJAN
Date: Tue, 22 Oct 2019 18:27:27 +0530
Subject: [PATCH] Fixed - to display the output

It is able to extract the words and count them.
---
Note: the script in this patch was revised after export (explicit import,
documentation); the commit id above and the blob id on the "index" line
below still refer to the originally exported version.

 text_preprocessing.py | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)
 create mode 100644 text_preprocessing.py

diff --git a/text_preprocessing.py b/text_preprocessing.py
new file mode 100644
index 0000000..0b117a7
--- /dev/null
+++ b/text_preprocessing.py
@@ -0,0 +1,46 @@
+"""Count learned vocabulary terms in unseen text with a bag-of-words model.
+
+Fits a ``CountVectorizer`` on a small training corpus, then prints a one-row
+table of how often each learned term occurs in the generalization text.
+"""
+
+import pandas as pd
+from sklearn.feature_extraction.text import CountVectorizer
+
+# Imported by name (not via a wildcard) so it is clear where ``process``
+# comes from.
+from clean_text import process
+
+# Corpus the vocabulary is learned from.
+training = [
+    " I am master of all",
+    "I am a absolute learner",
+]
+
+# Unseen text whose terms are counted against the learned vocabulary.
+generalization = [
+    "I am absolute learner learner",
+]
+
+# ``process.master_clean_text`` replaces the vectorizer's default
+# preprocessing step; English stop words are dropped from the vocabulary.
+vectorization = CountVectorizer(
+    stop_words="english",
+    preprocessor=process.master_clean_text,
+)
+vectorization.fit(training)
+
+# ``vocabulary_`` maps term -> column index. Invert it, then list the terms
+# in index order so the labels line up with the count-matrix columns.
+build_vocab = {
+    index: term for term, index in vectorization.vocabulary_.items()
+}
+vocab = [term for _, term in sorted(build_vocab.items())]
+
+extracted = pd.DataFrame(
+    data=vectorization.transform(generalization).toarray(),
+    index=["generalization"],
+    columns=vocab,
+)
+
+print(extracted)