|  | @ -0,0 +1,41 @@ | 
														
													
														
															
																|  |  |  |  |  | import shutil | 
														
													
														
															
																|  |  |  |  |  |  | 
														
													
														
															
																|  |  |  |  |  | from shutil import copyfile | 
														
													
														
															
																|  |  |  |  |  |  | 
														
													
														
															
																|  |  |  |  |  | copyfile(src = "../input/cleantext/cleantext.py", dst = "../working/cleantext.py") | 
														
													
														
															
																|  |  |  |  |  |  | 
														
													
														
															
																|  |  |  |  |  | # import all our functions | 
														
													
														
															
																|  |  |  |  |  | from cleantext import * | 
														
													
														
															
																|  |  |  |  |  |  | 
														
													
														
															
																|  |  |  |  |  | #!pylint cleantext | 
														
													
														
															
																|  |  |  |  |  |  | 
														
													
														
															
																|  |  |  |  |  | import pandas as pd | 
														
													
														
															
																|  |  |  |  |  | from sklearn.feature_extraction.text import CountVectorizer | 
														
													
														
															
																|  |  |  |  |  |  | 
														
													
														
															
																|  |  |  |  |  | training = [ | 
														
													
														
															
																|  |  |  |  |  | " I am master of all", | 
														
													
														
															
																|  |  |  |  |  | "I am a absolute learner" | 
														
													
														
															
																|  |  |  |  |  | ] | 
														
													
														
															
																|  |  |  |  |  |  | 
														
													
														
															
																|  |  |  |  |  | generalization = [ | 
														
													
														
															
																|  |  |  |  |  | "I am absolute learner learner" | 
														
													
														
															
																|  |  |  |  |  | ] | 
														
													
														
															
																|  |  |  |  |  |  | 
														
													
														
															
																|  |  |  |  |  | vectorization = CountVectorizer( | 
														
													
														
															
																|  |  |  |  |  | stop_words = "english", | 
														
													
														
															
																|  |  |  |  |  | preprocessor = process.master_clean_text) | 
														
													
														
															
																|  |  |  |  |  |  | 
														
													
														
															
																|  |  |  |  |  | vectorization.fit(training) | 
														
													
														
															
																|  |  |  |  |  |  | 
														
													
														
															
																|  |  |  |  |  | build_vocab = { | 
														
													
														
															
																|  |  |  |  |  | value:key | 
														
													
														
															
																|  |  |  |  |  | for key , value in vectorization.vocabulary_.items() | 
														
													
														
															
																|  |  |  |  |  | } | 
														
													
														
															
																|  |  |  |  |  |  | 
														
													
														
															
																|  |  |  |  |  | vocab = [build_vocab[i] for i in range(len(build_vocab))] | 
														
													
														
															
																|  |  |  |  |  |  | 
														
													
														
															
																|  |  |  |  |  | pd.DataFrame( | 
														
													
														
															
																|  |  |  |  |  | data = vectorization.transform(generalization).toarray(), | 
														
													
														
															
																|  |  |  |  |  | index=["generalization"], | 
														
													
														
															
																|  |  |  |  |  | columns=vocab | 
														
													
														
															
																|  |  |  |  |  | ) |