Repository where I mostly put random Python scripts.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

39 lines
746 B

  1. import clean_text
  2. # import all our functions
  3. from clean_text import *
  4. #!pylint cleantext
  5. import pandas as pd
  6. from sklearn.feature_extraction.text import CountVectorizer
  7. training = [
  8. " I am master of all",
  9. "I am a absolute learner"
  10. ]
  11. generalization = [
  12. "I am absolute learner learner"
  13. ]
  14. vectorization = CountVectorizer(
  15. stop_words = "english",
  16. preprocessor = process.master_clean_text)
  17. vectorization.fit(training)
  18. build_vocab = {
  19. value:key
  20. for key , value in vectorization.vocabulary_.items()
  21. }
  22. vocab = [build_vocab[i] for i in range(len(build_vocab))]
  23. extracted = pd.DataFrame(
  24. data = vectorization.transform(generalization).toarray(),
  25. index=["generalization"],
  26. columns=vocab
  27. )
  28. print(extracted)