|
@ -0,0 +1,284 @@ |
|
|
|
|
|
{ |
|
|
|
|
|
"cells": [ |
|
|
|
|
|
{ |
|
|
|
|
|
"cell_type": "code", |
|
|
|
|
|
"execution_count": 1, |
|
|
|
|
|
"metadata": {}, |
|
|
|
|
|
"outputs": [], |
|
|
|
|
|
"source": [ |
|
|
|
|
|
"import gensim\n" |
|
|
|
|
|
] |
|
|
|
|
|
}, |
|
|
|
|
|
{ |
|
|
|
|
|
"cell_type": "code", |
|
|
|
|
|
"execution_count": 4, |
|
|
|
|
|
"metadata": {}, |
|
|
|
|
|
"outputs": [], |
|
|
|
|
|
"source": [ |
|
|
|
|
|
"model = gensim.models.KeyedVectors.load_word2vec_format('./GoogleNews-vectors-negative300.bin', binary=True) " |
|
|
|
|
|
] |
|
|
|
|
|
}, |
|
|
|
|
|
{ |
|
|
|
|
|
"cell_type": "code", |
|
|
|
|
|
"execution_count": 6, |
|
|
|
|
|
"metadata": {}, |
|
|
|
|
|
"outputs": [ |
|
|
|
|
|
{ |
|
|
|
|
|
"data": { |
|
|
|
|
|
"text/plain": [ |
|
|
|
|
|
"[('hi', 0.654898464679718),\n", |
|
|
|
|
|
" ('goodbye', 0.639905571937561),\n", |
|
|
|
|
|
" ('howdy', 0.6310957074165344),\n", |
|
|
|
|
|
" ('goodnight', 0.5920578241348267),\n", |
|
|
|
|
|
" ('greeting', 0.5855878591537476),\n", |
|
|
|
|
|
" ('Hello', 0.5842196941375732),\n", |
|
|
|
|
|
" (\"g'day\", 0.5754077434539795),\n", |
|
|
|
|
|
" ('See_ya', 0.5688871145248413),\n", |
|
|
|
|
|
" ('ya_doin', 0.5643119812011719),\n", |
|
|
|
|
|
" ('greet', 0.5636603832244873)]" |
|
|
|
|
|
] |
|
|
|
|
|
}, |
|
|
|
|
|
"execution_count": 6, |
|
|
|
|
|
"metadata": {}, |
|
|
|
|
|
"output_type": "execute_result" |
|
|
|
|
|
} |
|
|
|
|
|
], |
|
|
|
|
|
"source": [ |
|
|
|
|
|
"model.most_similar(\"hello\")" |
|
|
|
|
|
] |
|
|
|
|
|
}, |
|
|
|
|
|
{ |
|
|
|
|
|
"cell_type": "code", |
|
|
|
|
|
"execution_count": 8, |
|
|
|
|
|
"metadata": {}, |
|
|
|
|
|
"outputs": [ |
|
|
|
|
|
{ |
|
|
|
|
|
"data": { |
|
|
|
|
|
"text/plain": [ |
|
|
|
|
|
"[('coders', 0.6104331612586975),\n", |
|
|
|
|
|
" ('coder', 0.6063331365585327),\n", |
|
|
|
|
|
" ('Coding', 0.5804804563522339),\n", |
|
|
|
|
|
" ('formatting', 0.5671651363372803),\n", |
|
|
|
|
|
" ('soluble_receptors', 0.5576372146606445),\n", |
|
|
|
|
|
" ('ICD9', 0.5571348667144775),\n", |
|
|
|
|
|
" ('refactoring', 0.5495434999465942),\n", |
|
|
|
|
|
" ('database_schemas', 0.5372464656829834),\n", |
|
|
|
|
|
" ('recode', 0.534299373626709),\n", |
|
|
|
|
|
" ('XHTML_CSS', 0.5328801870346069)]" |
|
|
|
|
|
] |
|
|
|
|
|
}, |
|
|
|
|
|
"execution_count": 8, |
|
|
|
|
|
"metadata": {}, |
|
|
|
|
|
"output_type": "execute_result" |
|
|
|
|
|
} |
|
|
|
|
|
], |
|
|
|
|
|
"source": [ |
|
|
|
|
|
"model.most_similar(\"coding\")" |
|
|
|
|
|
] |
|
|
|
|
|
}, |
|
|
|
|
|
{ |
|
|
|
|
|
"cell_type": "code", |
|
|
|
|
|
"execution_count": 9, |
|
|
|
|
|
"metadata": {}, |
|
|
|
|
|
"outputs": [ |
|
|
|
|
|
{ |
|
|
|
|
|
"data": { |
|
|
|
|
|
"text/plain": [ |
|
|
|
|
|
"[('cats', 0.8099379539489746),\n", |
|
|
|
|
|
" ('dog', 0.7609456777572632),\n", |
|
|
|
|
|
" ('kitten', 0.7464985251426697),\n", |
|
|
|
|
|
" ('feline', 0.7326233983039856),\n", |
|
|
|
|
|
" ('beagle', 0.7150583267211914),\n", |
|
|
|
|
|
" ('puppy', 0.7075453996658325),\n", |
|
|
|
|
|
" ('pup', 0.6934291124343872),\n", |
|
|
|
|
|
" ('pet', 0.6891531348228455),\n", |
|
|
|
|
|
" ('felines', 0.6755931377410889),\n", |
|
|
|
|
|
" ('chihuahua', 0.6709762215614319)]" |
|
|
|
|
|
] |
|
|
|
|
|
}, |
|
|
|
|
|
"execution_count": 9, |
|
|
|
|
|
"metadata": {}, |
|
|
|
|
|
"output_type": "execute_result" |
|
|
|
|
|
} |
|
|
|
|
|
], |
|
|
|
|
|
"source": [ |
|
|
|
|
|
"model.most_similar(\"cat\")" |
|
|
|
|
|
] |
|
|
|
|
|
}, |
|
|
|
|
|
{ |
|
|
|
|
|
"cell_type": "code", |
|
|
|
|
|
"execution_count": 16, |
|
|
|
|
|
"metadata": {}, |
|
|
|
|
|
"outputs": [ |
|
|
|
|
|
{ |
|
|
|
|
|
"name": "stdout", |
|
|
|
|
|
"output_type": "stream", |
|
|
|
|
|
"text": [ |
|
|
|
|
|
"hi globe \n" |
|
|
|
|
|
] |
|
|
|
|
|
} |
|
|
|
|
|
], |
|
|
|
|
|
"source": [ |
|
|
|
|
|
"def transformSentence(sentence):\n", |
|
|
|
|
|
" outputSentence = \"\"\n", |
|
|
|
|
|
" \n", |
|
|
|
|
|
" for word in sentence.split(\" \"):\n", |
|
|
|
|
|
" try:\n", |
|
|
|
|
|
" outputSentence += model.most_similar(word)[0][0] + \" \"\n", |
|
|
|
|
|
" except Exception:\n", |
|
|
|
|
|
" outputSentence += word + \" \"\n", |
|
|
|
|
|
" return outputSentence\n", |
|
|
|
|
|
"\n", |
|
|
|
|
|
"print(transformSentence(\"hello world\"))" |
|
|
|
|
|
] |
|
|
|
|
|
}, |
|
|
|
|
|
{ |
|
|
|
|
|
"cell_type": "code", |
|
|
|
|
|
"execution_count": 12, |
|
|
|
|
|
"metadata": {}, |
|
|
|
|
|
"outputs": [ |
|
|
|
|
|
{ |
|
|
|
|
|
"name": "stdout", |
|
|
|
|
|
"output_type": "stream", |
|
|
|
|
|
"text": [ |
|
|
|
|
|
"looks Mom No hand \n" |
|
|
|
|
|
] |
|
|
|
|
|
} |
|
|
|
|
|
], |
|
|
|
|
|
"source": [ |
|
|
|
|
|
"print(transformSentence(\"look mom no hands\"))" |
|
|
|
|
|
] |
|
|
|
|
|
}, |
|
|
|
|
|
{ |
|
|
|
|
|
"cell_type": "code", |
|
|
|
|
|
"execution_count": 17, |
|
|
|
|
|
"metadata": {}, |
|
|
|
|
|
"outputs": [ |
|
|
|
|
|
{ |
|
|
|
|
|
"name": "stdout", |
|
|
|
|
|
"output_type": "stream", |
|
|
|
|
|
"text": [ |
|
|
|
|
|
"This gen_eral concept of Clustering was to groups Data wtih similiar trait \n" |
|
|
|
|
|
] |
|
|
|
|
|
} |
|
|
|
|
|
], |
|
|
|
|
|
"source": [ |
|
|
|
|
|
"print(transformSentence(\"The general idea of clustering is to group data with similar traits\"))" |
|
|
|
|
|
] |
|
|
|
|
|
}, |
|
|
|
|
|
{ |
|
|
|
|
|
"cell_type": "code", |
|
|
|
|
|
"execution_count": 52, |
|
|
|
|
|
"metadata": {}, |
|
|
|
|
|
"outputs": [ |
|
|
|
|
|
{ |
|
|
|
|
|
"name": "stdout", |
|
|
|
|
|
"output_type": "stream", |
|
|
|
|
|
"text": [ |
|
|
|
|
|
"This manager concept of clusters was to groups datasets wtih similiar traits. \n" |
|
|
|
|
|
] |
|
|
|
|
|
} |
|
|
|
|
|
], |
|
|
|
|
|
"source": [ |
|
|
|
|
|
"def removeFromString(string, chars):\n", |
|
|
|
|
|
" for c in chars:\n", |
|
|
|
|
|
" string = string.replace(c, \"\")\n", |
|
|
|
|
|
" return string\n", |
|
|
|
|
|
"\n", |
|
|
|
|
|
"\n", |
|
|
|
|
|
"def transformSentenceWithHeuristic(sentence):\n", |
|
|
|
|
|
" outputSentence = \"\"\n", |
|
|
|
|
|
" \n", |
|
|
|
|
|
" for word in sentence.split(\" \"):\n", |
|
|
|
|
|
" try:\n", |
|
|
|
|
|
" changed = False\n", |
|
|
|
|
|
" for w, _ in model.most_similar(word):\n", |
|
|
|
|
|
" clean = removeFromString(w, [' ', '_']).lower()\n", |
|
|
|
|
|
" if clean not in word.lower() and \"_\" not in w:\n", |
|
|
|
|
|
" outputSentence += w + \" \"\n", |
|
|
|
|
|
" changed = True\n", |
|
|
|
|
|
" break\n", |
|
|
|
|
|
" outputSentence = outputSentence if changed else outputSentence + word + \" \"\n", |
|
|
|
|
|
" except Exception:\n", |
|
|
|
|
|
" outputSentence += word + \" \"\n", |
|
|
|
|
|
" return outputSentence\n", |
|
|
|
|
|
"print(transformSentenceWithHeuristic(\"The general idea of clustering is to group data with similar traits.\"))" |
|
|
|
|
|
] |
|
|
|
|
|
}, |
|
|
|
|
|
{ |
|
|
|
|
|
"cell_type": "code", |
|
|
|
|
|
"execution_count": 53, |
|
|
|
|
|
"metadata": {}, |
|
|
|
|
|
"outputs": [ |
|
|
|
|
|
{ |
|
|
|
|
|
"name": "stdout", |
|
|
|
|
|
"output_type": "stream", |
|
|
|
|
|
"text": [ |
|
|
|
|
|
"Relax up and grabbing a drinks but that was day it I talking abut this hallucinogenic trips it was this 1981 film Fever Treatment. \n" |
|
|
|
|
|
] |
|
|
|
|
|
} |
|
|
|
|
|
], |
|
|
|
|
|
"source": [ |
|
|
|
|
|
"print(transformSentenceWithHeuristic(\"Sit down and grab a drink because it is time that we talk about the LSD trip that is the 1981 movie Shock Treatment.\"))" |
|
|
|
|
|
] |
|
|
|
|
|
}, |
|
|
|
|
|
{ |
|
|
|
|
|
"cell_type": "code", |
|
|
|
|
|
"execution_count": 54, |
|
|
|
|
|
"metadata": {}, |
|
|
|
|
|
"outputs": [], |
|
|
|
|
|
"source": [ |
|
|
|
|
|
"from sklearn.decomposition import IncrementalPCA # initial reduction\n", |
|
|
|
|
|
"from sklearn.manifold import TSNE # final reduction\n", |
|
|
|
|
|
"import numpy as np # array handling\n", |
|
|
|
|
|
"\n", |
|
|
|
|
|
"\n", |
|
|
|
|
|
"def reduce_dimensions(model):\n", |
|
|
|
|
|
" num_dimensions = 2 # final num dimensions (2D, 3D, etc)\n", |
|
|
|
|
|
"\n", |
|
|
|
|
|
" vectors = [] # positions in vector space\n", |
|
|
|
|
|
" labels = [] # keep track of words to label our data again later\n", |
|
|
|
|
|
" for word in model.wv.vocab:\n", |
|
|
|
|
|
" vectors.append(model.wv[word])\n", |
|
|
|
|
|
" labels.append(word)\n", |
|
|
|
|
|
"\n", |
|
|
|
|
|
" # convert both lists into numpy arrays for reduction\n", |
|
|
|
|
|
" vectors = np.asarray(vectors)\n", |
|
|
|
|
|
" labels = np.asarray(labels)\n", |
|
|
|
|
|
"\n", |
|
|
|
|
|
" # reduce using t-SNE\n", |
|
|
|
|
|
" vectors = np.asarray(vectors)\n", |
|
|
|
|
|
" tsne = TSNE(n_components=num_dimensions, random_state=0)\n", |
|
|
|
|
|
" vectors = tsne.fit_transform(vectors)\n", |
|
|
|
|
|
"\n", |
|
|
|
|
|
" x_vals = [v[0] for v in vectors]\n", |
|
|
|
|
|
" y_vals = [v[1] for v in vectors]\n", |
|
|
|
|
|
" return x_vals, y_vals, labels\n", |
|
|
|
|
|
"\n", |
|
|
|
|
|
"\n", |
|
|
|
|
|
"#x_vals, y_vals, labels = reduce_dimensions(model)" |
|
|
|
|
|
] |
|
|
|
|
|
} |
|
|
|
|
|
], |
|
|
|
|
|
"metadata": { |
|
|
|
|
|
"kernelspec": { |
|
|
|
|
|
"display_name": "Python 3", |
|
|
|
|
|
"language": "python", |
|
|
|
|
|
"name": "python3" |
|
|
|
|
|
}, |
|
|
|
|
|
"language_info": { |
|
|
|
|
|
"codemirror_mode": { |
|
|
|
|
|
"name": "ipython", |
|
|
|
|
|
"version": 3 |
|
|
|
|
|
}, |
|
|
|
|
|
"file_extension": ".py", |
|
|
|
|
|
"mimetype": "text/x-python", |
|
|
|
|
|
"name": "python", |
|
|
|
|
|
"nbconvert_exporter": "python", |
|
|
|
|
|
"pygments_lexer": "ipython3", |
|
|
|
|
|
"version": "3.8.1" |
|
|
|
|
|
} |
|
|
|
|
|
}, |
|
|
|
|
|
"nbformat": 4, |
|
|
|
|
|
"nbformat_minor": 4 |
|
|
|
|
|
} |