Browse Source

Merge pull request #30 from THIYAGU22/patch-3

text_preprocessing.py
master
Jeffery Russell GitHub 1 year ago
parent
commit
bb2fb4975d
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 80 additions and 0 deletions
  1. +41
    -0
      ML Cookbook/text_preprocessing
  2. +39
    -0
      text_preprocessing.py

+ 41
- 0
ML Cookbook/text_preprocessing View File

@@ -0,0 +1,41 @@
import shutil

from shutil import copyfile

# Copy the cleantext helper module from the input directory into the working
# directory. Presumably this is what lets the `from cleantext import *` below
# resolve -- assumes the working directory is on the import path; confirm.
copyfile("../input/cleantext/cleantext.py", "../working/cleantext.py")

# import all our functions
from cleantext import *

#!pylint cleantext

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Sentences the vectorizer's vocabulary is learned from.
training = [" I am master of all", "I am a absolute learner"]

# Sentence that is only transformed against the already-fitted vocabulary.
generalization = ["I am absolute learner learner"]

# Bag-of-words vectorizer using scikit-learn's built-in English stop-word
# list. Each document is passed through process.master_clean_text first
# (helper pulled in by the star import above; its exact cleaning steps are
# not visible here). fit() returns the estimator itself, so it can be chained.
vectorization = CountVectorizer(
    preprocessor=process.master_clean_text,
    stop_words="english",
).fit(training)

# vocabulary_ maps term -> column index; flip it to column index -> term.
build_vocab = dict(
    zip(vectorization.vocabulary_.values(), vectorization.vocabulary_.keys())
)

# Terms listed in column order, so they line up with the count matrix columns.
vocab = [build_vocab[column] for column in sorted(build_vocab)]

# One-row table of term counts for the generalization sentence. The value is
# not assigned: it is only shown when this runs as the last expression of a
# notebook cell.
pd.DataFrame(
    vectorization.transform(generalization).toarray(),
    index=["generalization"],
    columns=vocab,
)

+ 39
- 0
text_preprocessing.py View File

@@ -0,0 +1,39 @@
import clean_text

# import all our functions
from clean_text import *

#!pylint clean_text

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Sentences the vectorizer's vocabulary is learned from.
training = [
    " I am master of all",
    "I am a absolute learner",
]

# Sentence that is only transformed against the already-fitted vocabulary.
generalization = ["I am absolute learner learner"]

# Bag-of-words vectorizer using scikit-learn's built-in English stop-word
# list. Each document is passed through process.master_clean_text first
# (helper pulled in by the star import above; its exact cleaning steps are
# not visible here). fit() returns the estimator itself, so it can be chained.
vectorization = CountVectorizer(
    preprocessor=process.master_clean_text,
    stop_words="english",
).fit(training)

# vocabulary_ maps term -> column index; invert it to column index -> term.
build_vocab = {
    column: term for term, column in vectorization.vocabulary_.items()
}

# Terms listed in column order, so they line up with the count matrix columns.
vocab = [term for _, term in sorted(build_vocab.items())]

# One-row table holding the term counts of the generalization sentence.
extracted = pd.DataFrame(
    vectorization.transform(generalization).toarray(),
    index=["generalization"],
    columns=vocab,
)

print(extracted)

Loading…
Cancel
Save