diff --git a/ML Cookbook/text_preprocessing b/ML Cookbook/text_preprocessing
new file mode 100644
index 0000000..8c10309
--- /dev/null
+++ b/ML Cookbook/text_preprocessing
@@ -0,0 +1,43 @@
+from shutil import copyfile
+
+# copy the helper module into the working directory so it can be imported
+copyfile(src="../input/cleantext/cleantext.py", dst="../working/cleantext.py")
+
+# import all our functions
+from cleantext import *
+
+#!pylint cleantext
+
+import pandas as pd
+from sklearn.feature_extraction.text import CountVectorizer
+
+training = [
+    " I am master of all",
+    "I am a absolute learner"
+]
+
+generalization = [
+    "I am absolute learner learner"
+]
+
+# bag-of-words vectorizer that runs the custom cleaner before tokenizing
+vectorization = CountVectorizer(
+    stop_words="english",
+    preprocessor=process.master_clean_text)
+
+vectorization.fit(training)
+
+# invert the {token: index} vocabulary so columns can be labelled by token
+build_vocab = {
+    value: key
+    for key, value in vectorization.vocabulary_.items()
+}
+
+vocab = [build_vocab[i] for i in range(len(build_vocab))]
+
+# counts for the unseen sentence against the training vocabulary
+pd.DataFrame(
+    data=vectorization.transform(generalization).toarray(),
+    index=["generalization"],
+    columns=vocab
+)
diff --git a/text_preprocessing.py b/text_preprocessing.py
new file mode 100644
index 0000000..0b117a7
--- /dev/null
+++ b/text_preprocessing.py
@@ -0,0 +1,40 @@
+# import all our functions
+from clean_text import *
+
+#!pylint clean_text
+
+import pandas as pd
+from sklearn.feature_extraction.text import CountVectorizer
+
+training = [
+    " I am master of all",
+    "I am a absolute learner"
+]
+
+generalization = [
+    "I am absolute learner learner"
+]
+
+# bag-of-words vectorizer that runs the custom cleaner before tokenizing
+vectorization = CountVectorizer(
+    stop_words="english",
+    preprocessor=process.master_clean_text)
+
+vectorization.fit(training)
+
+# invert the {token: index} vocabulary so columns can be labelled by token
+build_vocab = {
+    value: key
+    for key, value in vectorization.vocabulary_.items()
+}
+
+vocab = [build_vocab[i] for i in range(len(build_vocab))]
+
+# counts for the unseen sentence against the training vocabulary
+extracted = pd.DataFrame(
+    data=vectorization.transform(generalization).toarray(),
+    index=["generalization"],
+    columns=vocab
+)
+
+print(extracted)
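
Note: both scripts depend on an external cleantext/clean_text helper that is not part of this diff, and they call it through a `process` namespace whose contents are not shown here. A minimal stand-in sketch of that assumed helper, useful only for running the snippets without the Kaggle input, might look like the following (the class name and regex rules are hypothetical; only the `process.master_clean_text` entry point mirrors the calls above):

# clean_text.py -- hypothetical stand-in for the helper used above
import re

class _Process:
    @staticmethod
    def master_clean_text(text):
        # lower-case, drop everything except letters and whitespace,
        # then collapse repeated whitespace
        text = text.lower()
        text = re.sub(r"[^a-z\s]", " ", text)
        return re.sub(r"\s+", " ", text).strip()

# object referenced as `process.master_clean_text` in the scripts
process = _Process()

With a cleaner along these lines and the English stop-word list, the training sentences should reduce to the vocabulary {absolute, learner, master}, so the "generalization" row would count "learner" twice and "master" zero times.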