From f6d0a2b97c56994a6f537a0001c28dc2dbd9270a Mon Sep 17 00:00:00 2001
From: THIYAGARAJAN <thiyagarajanravi22@gmail.com>
Date: Mon, 21 Oct 2019 19:44:11 +0530
Subject: [PATCH 1/2] text_preprocessing.py

Text preprocessing -- this file cleans the text and uses a count vectorizer to extract the words and count how many times each one occurs
---
 ML Cookbook/text_preprocessing | 41 ++++++++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)
 create mode 100644 ML Cookbook/text_preprocessing

diff --git a/ML Cookbook/text_preprocessing b/ML Cookbook/text_preprocessing
new file mode 100644
index 0000000..8c10309
--- /dev/null
+++ b/ML Cookbook/text_preprocessing	
@@ -0,0 +1,41 @@
+import shutil 
+# NOTE(review): the bare `shutil` name is not referenced below; only copyfile is called.
+from shutil import copyfile
+# Copy cleantext.py from ../input/cleantext into ../working (presumably so the import below can find it).
+copyfile(src = "../input/cleantext/cleantext.py", dst = "../working/cleantext.py")
+
+# Import every public name from cleantext; `process` used below presumably comes from this star import.
+from cleantext import *
+
+#!pylint cleantext
+
+import pandas as pd
+from sklearn.feature_extraction.text import CountVectorizer
+# Sentences used to fit the vectorizer's vocabulary.
+training = [
+    " I am master of all",
+    "I am a absolute learner"
+]
+# Sentence that is transformed with the already-fitted vocabulary.
+generalization = [
+    "I am absolute learner learner"
+]
+# Token counter: English stop words are dropped; process.master_clean_text is the preprocessor.
+vectorization = CountVectorizer(
+    stop_words = "english",
+    preprocessor = process.master_clean_text)
+# Learn the vocabulary from the training sentences.
+vectorization.fit(training)
+# Invert vocabulary_ (term -> column index) into column index -> term.
+build_vocab = {
+     value:key 
+     for key , value in vectorization.vocabulary_.items()
+}
+# Terms listed in column-index order, used as the DataFrame column labels.
+vocab = [build_vocab[i] for i in range(len(build_vocab))]
+# One-row table of term counts for `generalization`; NOTE(review): the result is neither assigned nor printed.
+pd.DataFrame(
+data = vectorization.transform(generalization).toarray(),
+    index=["generalization"],
+    columns=vocab
+)

From 6e04e517a7d3f3bed80e2f1d571fe9a0a7311a2c Mon Sep 17 00:00:00 2001
From: THIYAGARAJAN <thiyagarajanravi22@gmail.com>
Date: Tue, 22 Oct 2019 18:27:27 +0530
Subject: [PATCH 2/2] Fixed - to display the output

It is now able to extract the words and print the count of each one
---
 text_preprocessing.py | 39 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)
 create mode 100644 text_preprocessing.py

diff --git a/text_preprocessing.py b/text_preprocessing.py
new file mode 100644
index 0000000..0b117a7
--- /dev/null
+++ b/text_preprocessing.py
@@ -0,0 +1,39 @@
+import clean_text
+
+# Import every public name from clean_text; `process` used below presumably comes from this star import.
+from clean_text import *
+
+#!pylint cleantext
+
+import pandas as pd
+from sklearn.feature_extraction.text import CountVectorizer
+# Sentences used to fit the vectorizer's vocabulary.
+training = [
+    " I am master of all",
+    "I am a absolute learner"
+]
+# Sentence that is transformed with the already-fitted vocabulary.
+generalization = [
+    "I am absolute learner learner"
+]
+# Token counter: English stop words are dropped; process.master_clean_text is the preprocessor.
+vectorization = CountVectorizer(
+    stop_words = "english",
+    preprocessor = process.master_clean_text)
+# Learn the vocabulary from the training sentences.
+vectorization.fit(training)
+# Invert vocabulary_ (term -> column index) into column index -> term.
+build_vocab = {
+     value:key 
+     for key , value in vectorization.vocabulary_.items()
+}
+# Terms listed in column-index order, used as the DataFrame column labels.
+vocab = [build_vocab[i] for i in range(len(build_vocab))]
+# One-row table of term counts for `generalization`, with one column per vocabulary term.
+extracted = pd.DataFrame(
+data = vectorization.transform(generalization).toarray(),
+    index=["generalization"],
+    columns=vocab
+)
+# Print the count table to standard output.
+print(extracted)