| 
						
						
						
					 | 
				
				 | 
				
					@ -0,0 +1,29 @@ | 
				
			
			
		
	
		
			
				
					 | 
					 | 
				
				 | 
				
					
 | 
				
			
			
		
	
		
			
				
					 | 
					 | 
				
				 | 
				
					import re as clear | 
				
			
			
		
	
		
			
				
					 | 
					 | 
				
				 | 
				
					
 | 
				
			
			
		
	
		
			
				
					 | 
					 | 
				
				 | 
				
					class process():
					    """Namespace for text clean-up helpers used on scraped text."""

					    # Every character listed here is replaced by a single space.
					    _UNWANTED = '!"\'#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'

					    # Built once at class-definition time instead of on every call.
					    # str.maketrans(x, y) maps each character of x to the character
					    # at the same position in y.
					    _TO_SPACE = str.maketrans(_UNWANTED, " " * len(_UNWANTED))

					    @staticmethod
					    def master_clean_text(text):
					        """Return *text* with markup and punctuation removed, lowercased.

					        Steps, in order:

					        1. Delete anything that looks like an HTML tag (``<...>``).
					        2. Delete backslashes, apostrophes and double quotes outright,
					           so ``you'r`` becomes ``your`` rather than ``you r``.
					        3. Strip leading/trailing whitespace and lowercase the text.
					        4. Replace every remaining punctuation character, tab and
					           newline with a single space.

					        Because step 4 runs after the strip, punctuation at either end
					        of the text leaves a space there; callers that need tokens can
					        simply call ``.split()`` on the result.

					        Declared as a static method so that it can be called both as
					        ``process.master_clean_text(s)`` and on an instance.

					        Args:
					            text: The raw string to clean.

					        Returns:
					            The cleaned, lowercased string.
					        """
					        # ``clear`` is this file's alias for the ``re`` module.
					        # Clean up all the HTML tags (non-greedy, so each tag is
					        # matched separately).
					        text = clear.sub(r'<.*?>', '', text)

					        # Remove the unwanted punctuation characters that must not
					        # leave a gap behind: backslash, apostrophe, double quote.
					        text = clear.sub(r"[\\'\"]", "", text)

					        # Conversion to lowercase to reduce complexity downstream.
					        text = text.strip().lower()

					        # Replace the remaining unwanted characters with spaces in a
					        # single pass.
					        return text.translate(process._TO_SPACE)
				
			
			
		
	
		
			
				
					 | 
					 | 
				
				 | 
				
					    #master_clean_text("<a> Say youre scrapping a text from you'r website !! WEll it might be swap CASE or unevened you wanna remove all the punctation's into separate WOrd !!!!</a>").split() |