Browse Source

text_preprocessing.py

Text preprocessing -- this file performs the text manipulation and uses a count-vectorization method to count how many times each word occurs.
pull/30/head
THIYAGARAJAN 5 years ago
committed by GitHub
parent
commit
f6d0a2b97c
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
1 changed files with 41 additions and 0 deletions
  1. +41
    -0
      ML Cookbook/text_preprocessing

+ 41
- 0
ML Cookbook/text_preprocessing View File

@ -0,0 +1,41 @@
import shutil
from shutil import copyfile

# Copy cleantext.py from the input directory into ../working. This runs
# before the import below, presumably so that the import can find the module.
copyfile(src="../input/cleantext/cleantext.py", dst="../working/cleantext.py")

# import all our functions
from cleantext import *
#!pylint cleantext
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Sentences the vectorizer is fitted on.
training = [
    " I am master of all",
    "I am a absolute learner",
]

# Sentence that is transformed with the already-fitted vectorizer.
generalization = [
    "I am absolute learner learner",
]

# NOTE(review): `process` is not defined in this file; presumably it is
# provided by the star import from cleantext -- confirm.
vectorization = CountVectorizer(
    stop_words="english",
    preprocessor=process.master_clean_text,
)
vectorization.fit(training)

# vocabulary_ maps each term to its column index; invert it so the terms can
# be listed in column order.
build_vocab = dict(
    (column, term) for term, column in vectorization.vocabulary_.items()
)
vocab = [term for _, term in sorted(build_vocab.items())]

# One row of per-term counts for the generalization sentence, with the terms
# as column labels. The frame is left as a bare expression (not bound to a
# name), as in a notebook cell whose last value is displayed.
counts = vectorization.transform(generalization).toarray()
pd.DataFrame(data=counts, index=["generalization"], columns=vocab)

Loading…
Cancel
Save