|  |  | @ -0,0 +1,39 @@ | 
			
		
	
		
			
				
					|  |  |  | import clean_text | 
			
		
	
		
			
				
					|  |  |  |  | 
			
		
	
		
			
				
					|  |  |  | # import all our functions | 
			
		
	
		
			
				
					|  |  |  | from clean_text import * | 
			
		
	
		
			
				
					|  |  |  |  | 
			
		
	
		
			
				
					|  |  |  | #!pylint cleantext | 
			
		
	
		
			
				
					|  |  |  |  | 
			
		
	
		
			
				
					|  |  |  | import pandas as pd | 
			
		
	
		
			
				
					|  |  |  | from sklearn.feature_extraction.text import CountVectorizer | 
			
		
	
		
			
				
					|  |  |  |  | 
			
		
	
		
			
				
					# Bag-of-words demo: fit a CountVectorizer on `training`, then encode
					# `generalization` with the learned vocabulary and print the result.

					# Corpus used to fit the vectorizer's vocabulary.
					training = [
					    " I am master of all",
					    "I am a absolute learner",
					]

					# Text that is only transformed (never fitted) with that vocabulary.
					generalization = [
					    "I am absolute learner learner",
					]

					# NOTE(review): `process` is not imported by name in this file; presumably
					# it is exposed by `from clean_text import *` -- confirm, otherwise this
					# line raises NameError.
					vectorization = CountVectorizer(
					    stop_words="english",
					    preprocessor=process.master_clean_text,
					)

					vectorization.fit(training)

					# `vocabulary_` maps term -> column index; invert it to index -> term.
					build_vocab = {
					    column: term
					    for term, column in vectorization.vocabulary_.items()
					}

					# Terms listed in column order. The indices are unique, so sorting the
					# (index, term) pairs orders by index alone and replaces the previous
					# range(len(...)) lookup loop.
					vocab = [term for _, term in sorted(build_vocab.items())]

					# A single row of term counts, with one column per vocabulary term.
					extracted = pd.DataFrame(
					    data=vectorization.transform(generalization).toarray(),
					    # One label for the single document in `generalization`; the label
					    # list must grow if more documents are added there.
					    index=["generalization"],
					    columns=vocab,
					)

					print(extracted)