Repository where I mostly put random Python scripts.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

41 lines
824 B

  # Bag-of-words demo: fit a CountVectorizer on two sentences, tabulate a third.
import shutil
from shutil import copyfile

# The star-import below needs cleantext.py to exist at the destination, so the
# copy has to run first. The paths look like a Kaggle dataset / working
# directory layout -- TODO confirm.
shutil.copyfile("../input/cleantext/cleantext.py", "../working/cleantext.py")

# Wildcard import kept as-is: `process` (used further down) is expected to come
# from it, and other names it exports may be used elsewhere in the file.
from cleantext import *
# Notebook-only lint step, left disabled: !pylint cleantext

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Sentences the vocabulary is learned from (literal text is significant,
# including the leading space in the first one).
training = [" I am master of all", "I am a absolute learner"]

# Unseen sentence that gets encoded against the learned vocabulary.
generalization = ["I am absolute learner learner"]

# A custom preprocessor replaces CountVectorizer's built-in preprocessing
# stage, so text normalisation is whatever `process.master_clean_text` does
# (its behaviour is not visible here). stop_words="english" drops tokens found
# in scikit-learn's built-in English stop-word list. fit() returns the
# estimator itself, so `vectorization` is the fitted vectorizer.
vectorization = CountVectorizer(
    preprocessor=process.master_clean_text,
    stop_words="english",
).fit(training)

# vocabulary_ maps term -> column index; flip it to column index -> term.
build_vocab = {index: term for term, index in vectorization.vocabulary_.items()}

# Terms listed in column order, used as the DataFrame header.
vocab = [term for _, term in sorted(build_vocab.items())]

# One row of term counts for the unseen sentence. This is a bare expression:
# a notebook renders it only when it is the last expression in a cell, and a
# plain script run discards it.
pd.DataFrame(
    vectorization.transform(generalization).toarray(),
    columns=vocab,
    index=["generalization"],
)