{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import gensim\n" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "model = gensim.models.KeyedVectors.load_word2vec_format('./GoogleNews-vectors-negative300.bin', binary=True) " ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('hi', 0.654898464679718),\n", " ('goodbye', 0.639905571937561),\n", " ('howdy', 0.6310957074165344),\n", " ('goodnight', 0.5920578241348267),\n", " ('greeting', 0.5855878591537476),\n", " ('Hello', 0.5842196941375732),\n", " (\"g'day\", 0.5754077434539795),\n", " ('See_ya', 0.5688871145248413),\n", " ('ya_doin', 0.5643119812011719),\n", " ('greet', 0.5636603832244873)]" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model.most_similar(\"hello\")" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('coders', 0.6104331612586975),\n", " ('coder', 0.6063331365585327),\n", " ('Coding', 0.5804804563522339),\n", " ('formatting', 0.5671651363372803),\n", " ('soluble_receptors', 0.5576372146606445),\n", " ('ICD9', 0.5571348667144775),\n", " ('refactoring', 0.5495434999465942),\n", " ('database_schemas', 0.5372464656829834),\n", " ('recode', 0.534299373626709),\n", " ('XHTML_CSS', 0.5328801870346069)]" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model.most_similar(\"coding\")" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('cats', 0.8099379539489746),\n", " ('dog', 0.7609456777572632),\n", " ('kitten', 0.7464985251426697),\n", " ('feline', 0.7326233983039856),\n", " ('beagle', 0.7150583267211914),\n", " ('puppy', 0.7075453996658325),\n", " ('pup', 0.6934291124343872),\n", " ('pet', 0.6891531348228455),\n", " ('felines', 0.6755931377410889),\n", " ('chihuahua', 0.6709762215614319)]" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model.most_similar(\"cat\")" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "hi globe \n" ] } ], "source": [ "def transformSentence(sentence):\n", " outputSentence = \"\"\n", " \n", " for word in sentence.split(\" \"):\n", " try:\n", " outputSentence += model.most_similar(word)[0][0] + \" \"\n", " except Exception:\n", " outputSentence += word + \" \"\n", " return outputSentence\n", "\n", "print(transformSentence(\"hello world\"))" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "looks Mom No hand \n" ] } ], "source": [ "print(transformSentence(\"look mom no hands\"))" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "This gen_eral concept of Clustering was to groups Data wtih similiar trait \n" ] } ], "source": [ "print(transformSentence(\"The general idea of clustering is to group data with similar traits\"))" ] }, { "cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "This manager concept of clusters was to groups datasets wtih similiar traits. \n" ] } ], "source": [ "def removeFromString(string, chars):\n", " for c in chars:\n", " string = string.replace(c, \"\")\n", " return string\n", "\n", "\n", "def transformSentenceWithHeuristic(sentence):\n", " outputSentence = \"\"\n", " \n", " for word in sentence.split(\" \"):\n", " try:\n", " changed = False\n", " for w, _ in model.most_similar(word):\n", " clean = removeFromString(w, [' ', '_']).lower()\n", " if clean not in word.lower() and \"_\" not in w:\n", " outputSentence += w + \" \"\n", " changed = True\n", " break\n", " outputSentence = outputSentence if changed else outputSentence + word + \" \"\n", " except Exception:\n", " outputSentence += word + \" \"\n", " return outputSentence\n", "print(transformSentenceWithHeuristic(\"The general idea of clustering is to group data with similar traits.\"))" ] }, { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Relax up and grabbing a drinks but that was day it I talking abut this hallucinogenic trips it was this 1981 film Fever Treatment. \n" ] } ], "source": [ "print(transformSentenceWithHeuristic(\"Sit down and grab a drink because it is time that we talk about the LSD trip that is the 1981 movie Shock Treatment.\"))" ] }, { "cell_type": "code", "execution_count": 54, "metadata": {}, "outputs": [], "source": [ "from sklearn.decomposition import IncrementalPCA # inital reduction\n", "from sklearn.manifold import TSNE # final reduction\n", "import numpy as np # array handling\n", "\n", "\n", "def reduce_dimensions(model):\n", " num_dimensions = 2 # final num dimensions (2D, 3D, etc)\n", "\n", " vectors = [] # positions in vector space\n", " labels = [] # keep track of words to label our data again later\n", " for word in model.wv.vocab:\n", " vectors.append(model.wv[word])\n", " labels.append(word)\n", "\n", " # convert both lists into numpy vectors for reduction\n", " vectors = np.asarray(vectors)\n", " labels = np.asarray(labels)\n", "\n", " # reduce using t-SNE\n", " vectors = np.asarray(vectors)\n", " tsne = TSNE(n_components=num_dimensions, random_state=0)\n", " vectors = tsne.fit_transform(vectors)\n", "\n", " x_vals = [v[0] for v in vectors]\n", " y_vals = [v[1] for v in vectors]\n", " return x_vals, y_vals, labels\n", "\n", "\n", "#x_vals, y_vals, labels = reduce_dimensions(model)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.1" } }, "nbformat": 4, "nbformat_minor": 4 }