"""Bag-of-words demo: fit a CountVectorizer and tabulate an unseen sentence.

The script fits ``CountVectorizer`` on the ``training`` sentences, using
scikit-learn's built-in English stop-word list and the project-supplied
``process.master_clean_text`` callable as the preprocessor. It then transforms
the ``generalization`` sentence with the fitted vocabulary and prints the
resulting term counts as a one-row pandas DataFrame.
"""
import clean_text

# import all our functions
from clean_text import *

#!pylint cleantext

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Corpus the vocabulary is learned from.
training = [
    " I am master of all",
    "I am a absolute learner",
]

# Unseen text, encoded with the vocabulary learned from `training`.
generalization = [
    "I am absolute learner learner",
]

# NOTE(review): `process` is not bound by any explicit import in this file;
# presumably it is exposed by `from clean_text import *` -- confirm.
vectorization = CountVectorizer(
    stop_words="english",
    preprocessor=process.master_clean_text,
)
vectorization.fit(training)

# `vocabulary_` maps term -> column index; invert it to column index -> term.
build_vocab = {
    column: term for term, column in vectorization.vocabulary_.items()
}

# Terms listed in column order, so they line up with the transformed matrix.
vocab = [term for _, term in sorted(build_vocab.items())]

# One row per document in `generalization` (exactly one here, matching the
# single index label), one column per vocabulary term.
extracted = pd.DataFrame(
    data=vectorization.transform(generalization).toarray(),
    index=["generalization"],
    columns=vocab,
)

print(extracted)