|
@ -0,0 +1,41 @@ |
|
|
|
|
|
import shutil
from shutil import copyfile

# Stage the helper module before importing it: cleantext.py is copied from the
# input dataset folder into the working folder, and only then imported below.
# (Presumably the working folder is on the import path while the input folder
# is not -- confirm for the environment this runs in.)
copyfile(src="../input/cleantext/cleantext.py", dst="../working/cleantext.py")

# Import all our functions.
# NOTE(review): the wildcard is kept because later code uses names such as
# `process` unqualified, and they appear to come from this module.
from cleantext import *
|
|
|
|
|
|
|
|
|
|
|
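# Disabled notebook shell command; uncomment it in a notebook cell to lint
# the staged cleantext module:
# !pylint cleantext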
|
|
|
|
|
|
|
|
|
|
|
# Third-party dependencies: pandas for the tabular view of the counts,
# scikit-learn for the bag-of-words vectorizer.
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
|
|
|
|
|
|
|
|
|
|
|
# Two-sentence corpus the vectorizer is fitted on.
# The leading space in the first sentence is part of the data and is kept.
training = [
    " I am master of all",
    "I am a absolute learner",
]

# Single sentence that is transformed with the fitted vocabulary; note that
# "learner" occurs twice in it.
generalization = [
    "I am absolute learner learner",
]
|
|
|
|
|
|
|
|
|
|
|
# Bag-of-words vectorizer: scikit-learn's built-in English stop-word list is
# applied, and `process.master_clean_text` is used as the preprocessor.
# NOTE(review): `process` is not defined in the visible code; it presumably
# comes from the `cleantext` wildcard import -- confirm.
vectorization = CountVectorizer(
    stop_words="english",
    preprocessor=process.master_clean_text,
)

# Learn the vocabulary from the training sentences only.
vectorization.fit(training)
|
|
|
|
|
|
|
|
|
|
|
# Invert the fitted term -> column-index mapping so that a term can be looked
# up by its column position.
build_vocab = {
    index: term
    for term, index in vectorization.vocabulary_.items()
}

# Terms listed in ascending column-index order, so vocab[i] labels column i.
vocab = [build_vocab[index] for index in sorted(build_vocab)]
|
|
|
|
|
|
|
|
|
|
|
# One-row table of term counts for the `generalization` sentence, with one
# column per vocabulary term in the vectorizer's column order.
# Left as a bare expression: a notebook renders it as the cell output, while
# a plain script discards the result.
pd.DataFrame(
    data=vectorization.transform(generalization).toarray(),
    index=["generalization"],
    columns=vocab,
)