| @ -0,0 +1,284 @@ | |||||
| { | |||||
| "cells": [ | |||||
| { | |||||
| "cell_type": "code", | |||||
| "execution_count": 1, | |||||
| "metadata": {}, | |||||
| "outputs": [], | |||||
| "source": [ | |||||
| "import gensim\n" | |||||
| ] | |||||
| }, | |||||
| { | |||||
| "cell_type": "code", | |||||
| "execution_count": 4, | |||||
| "metadata": {}, | |||||
| "outputs": [], | |||||
| "source": [ | |||||
| "model = gensim.models.KeyedVectors.load_word2vec_format('./GoogleNews-vectors-negative300.bin', binary=True) " | |||||
| ] | |||||
| }, | |||||
| { | |||||
| "cell_type": "code", | |||||
| "execution_count": 6, | |||||
| "metadata": {}, | |||||
| "outputs": [ | |||||
| { | |||||
| "data": { | |||||
| "text/plain": [ | |||||
| "[('hi', 0.654898464679718),\n", | |||||
| " ('goodbye', 0.639905571937561),\n", | |||||
| " ('howdy', 0.6310957074165344),\n", | |||||
| " ('goodnight', 0.5920578241348267),\n", | |||||
| " ('greeting', 0.5855878591537476),\n", | |||||
| " ('Hello', 0.5842196941375732),\n", | |||||
| " (\"g'day\", 0.5754077434539795),\n", | |||||
| " ('See_ya', 0.5688871145248413),\n", | |||||
| " ('ya_doin', 0.5643119812011719),\n", | |||||
| " ('greet', 0.5636603832244873)]" | |||||
| ] | |||||
| }, | |||||
| "execution_count": 6, | |||||
| "metadata": {}, | |||||
| "output_type": "execute_result" | |||||
| } | |||||
| ], | |||||
| "source": [ | |||||
| "model.most_similar(\"hello\")" | |||||
| ] | |||||
| }, | |||||
| { | |||||
| "cell_type": "code", | |||||
| "execution_count": 8, | |||||
| "metadata": {}, | |||||
| "outputs": [ | |||||
| { | |||||
| "data": { | |||||
| "text/plain": [ | |||||
| "[('coders', 0.6104331612586975),\n", | |||||
| " ('coder', 0.6063331365585327),\n", | |||||
| " ('Coding', 0.5804804563522339),\n", | |||||
| " ('formatting', 0.5671651363372803),\n", | |||||
| " ('soluble_receptors', 0.5576372146606445),\n", | |||||
| " ('ICD9', 0.5571348667144775),\n", | |||||
| " ('refactoring', 0.5495434999465942),\n", | |||||
| " ('database_schemas', 0.5372464656829834),\n", | |||||
| " ('recode', 0.534299373626709),\n", | |||||
| " ('XHTML_CSS', 0.5328801870346069)]" | |||||
| ] | |||||
| }, | |||||
| "execution_count": 8, | |||||
| "metadata": {}, | |||||
| "output_type": "execute_result" | |||||
| } | |||||
| ], | |||||
| "source": [ | |||||
| "model.most_similar(\"coding\")" | |||||
| ] | |||||
| }, | |||||
| { | |||||
| "cell_type": "code", | |||||
| "execution_count": 9, | |||||
| "metadata": {}, | |||||
| "outputs": [ | |||||
| { | |||||
| "data": { | |||||
| "text/plain": [ | |||||
| "[('cats', 0.8099379539489746),\n", | |||||
| " ('dog', 0.7609456777572632),\n", | |||||
| " ('kitten', 0.7464985251426697),\n", | |||||
| " ('feline', 0.7326233983039856),\n", | |||||
| " ('beagle', 0.7150583267211914),\n", | |||||
| " ('puppy', 0.7075453996658325),\n", | |||||
| " ('pup', 0.6934291124343872),\n", | |||||
| " ('pet', 0.6891531348228455),\n", | |||||
| " ('felines', 0.6755931377410889),\n", | |||||
| " ('chihuahua', 0.6709762215614319)]" | |||||
| ] | |||||
| }, | |||||
| "execution_count": 9, | |||||
| "metadata": {}, | |||||
| "output_type": "execute_result" | |||||
| } | |||||
| ], | |||||
| "source": [ | |||||
| "model.most_similar(\"cat\")" | |||||
| ] | |||||
| }, | |||||
| { | |||||
| "cell_type": "code", | |||||
| "execution_count": 16, | |||||
| "metadata": {}, | |||||
| "outputs": [ | |||||
| { | |||||
| "name": "stdout", | |||||
| "output_type": "stream", | |||||
| "text": [ | |||||
| "hi globe \n" | |||||
| ] | |||||
| } | |||||
| ], | |||||
| "source": [ | |||||
def transformSentence(sentence, word_model=None):
    """Replace each word of `sentence` with its nearest word2vec neighbour.

    Words missing from the model's vocabulary are kept unchanged.

    Parameters
    ----------
    sentence : str
        Space-separated words to transform.
    word_model : optional
        Object exposing ``most_similar(word)`` (a gensim KeyedVectors);
        defaults to the module-level ``model`` loaded above.

    Returns
    -------
    str
        Transformed sentence. Note: keeps the original's trailing-space
        behaviour (see the "hi globe " cell output) so callers see
        identical results.
    """
    if word_model is None:
        word_model = model
    outputSentence = ""
    for word in sentence.split(" "):
        try:
            # most_similar returns [(word, score), ...]; take the top hit.
            outputSentence += word_model.most_similar(word)[0][0] + " "
        except KeyError:
            # Out-of-vocabulary word (gensim raises KeyError): keep as-is.
            # Was a bare `except Exception`, which also hid real bugs.
            outputSentence += word + " "
    return outputSentence
| "\n", | |||||
| "print(transformSentence(\"hello world\"))" | |||||
| ] | |||||
| }, | |||||
| { | |||||
| "cell_type": "code", | |||||
| "execution_count": 12, | |||||
| "metadata": {}, | |||||
| "outputs": [ | |||||
| { | |||||
| "name": "stdout", | |||||
| "output_type": "stream", | |||||
| "text": [ | |||||
| "looks Mom No hand \n" | |||||
| ] | |||||
| } | |||||
| ], | |||||
| "source": [ | |||||
| "print(transformSentence(\"look mom no hands\"))" | |||||
| ] | |||||
| }, | |||||
| { | |||||
| "cell_type": "code", | |||||
| "execution_count": 17, | |||||
| "metadata": {}, | |||||
| "outputs": [ | |||||
| { | |||||
| "name": "stdout", | |||||
| "output_type": "stream", | |||||
| "text": [ | |||||
| "This gen_eral concept of Clustering was to groups Data wtih similiar trait \n" | |||||
| ] | |||||
| } | |||||
| ], | |||||
| "source": [ | |||||
| "print(transformSentence(\"The general idea of clustering is to group data with similar traits\"))" | |||||
| ] | |||||
| }, | |||||
| { | |||||
| "cell_type": "code", | |||||
| "execution_count": 52, | |||||
| "metadata": {}, | |||||
| "outputs": [ | |||||
| { | |||||
| "name": "stdout", | |||||
| "output_type": "stream", | |||||
| "text": [ | |||||
| "This manager concept of clusters was to groups datasets wtih similiar traits. \n" | |||||
| ] | |||||
| } | |||||
| ], | |||||
| "source": [ | |||||
def removeFromString(string, chars):
    """Return `string` with every substring listed in `chars` removed."""
    for c in chars:
        string = string.replace(c, "")
    return string


def transformSentenceWithHeuristic(sentence, word_model=None):
    """Replace each word with its best "genuinely different" neighbour.

    For every word, walk the model's most-similar candidates and take the
    first one that is not merely a spacing/underscore/case variant of the
    original word (and contains no underscore itself). Words with no such
    candidate, or missing from the vocabulary, are kept unchanged.

    Parameters
    ----------
    sentence : str
        Space-separated words to transform.
    word_model : optional
        Object exposing ``most_similar(word)``; defaults to the
        module-level ``model`` loaded above.

    Returns
    -------
    str
        Transformed sentence (trailing-space behaviour preserved).
    """
    if word_model is None:
        word_model = model
    outputSentence = ""
    for word in sentence.split(" "):
        try:
            changed = False
            for candidate, _score in word_model.most_similar(word):
                # Normalise the candidate: drop spaces/underscores, lowercase.
                stripped = removeFromString(candidate, [" ", "_"]).lower()
                # Skip near-duplicates (e.g. "Clustering" for "clustering")
                # and multi-token suggestions joined with underscores.
                if stripped not in word.lower() and "_" not in candidate:
                    outputSentence += candidate + " "
                    changed = True
                    break
            if not changed:
                outputSentence += word + " "
        except KeyError:
            # Out-of-vocabulary word (gensim raises KeyError): keep as-is.
            # Was a bare `except Exception`, which also hid real bugs.
            outputSentence += word + " "
    return outputSentence
| "print(transformSentenceWithHeuristic(\"The general idea of clustering is to group data with similar traits.\"))" | |||||
| ] | |||||
| }, | |||||
| { | |||||
| "cell_type": "code", | |||||
| "execution_count": 53, | |||||
| "metadata": {}, | |||||
| "outputs": [ | |||||
| { | |||||
| "name": "stdout", | |||||
| "output_type": "stream", | |||||
| "text": [ | |||||
| "Relax up and grabbing a drinks but that was day it I talking abut this hallucinogenic trips it was this 1981 film Fever Treatment. \n" | |||||
| ] | |||||
| } | |||||
| ], | |||||
| "source": [ | |||||
| "print(transformSentenceWithHeuristic(\"Sit down and grab a drink because it is time that we talk about the LSD trip that is the 1981 movie Shock Treatment.\"))" | |||||
| ] | |||||
| }, | |||||
| { | |||||
| "cell_type": "code", | |||||
| "execution_count": 54, | |||||
| "metadata": {}, | |||||
| "outputs": [], | |||||
| "source": [ | |||||
| "from sklearn.decomposition import IncrementalPCA # initial reduction\n", | |||||
| "from sklearn.manifold import TSNE # final reduction\n", | |||||
| "import numpy as np # array handling\n", | |||||
| "\n", | |||||
| "\n", | |||||
def reduce_dimensions(model):
    """Project a word2vec model's vectors down to 2-D with t-SNE.

    Parameters
    ----------
    model :
        Word2vec-style model exposing ``model.wv`` (vector lookup) and
        ``model.wv.vocab`` (iterable of vocabulary words).
        NOTE(review): ``.wv.vocab`` is the gensim 3.x API; gensim 4+
        renamed it to ``.key_to_index`` — confirm installed version
        before un-commenting the call below.

    Returns
    -------
    (x_vals, y_vals, labels) :
        Lists of the 2-D coordinates for each word, plus a numpy array
        of the corresponding words in the same order.
    """
    num_dimensions = 2  # final num dimensions (2D, 3D, etc)

    vectors = []  # positions in vector space
    labels = []  # keep track of words to label our data again later
    for word in model.wv.vocab:
        vectors.append(model.wv[word])
        labels.append(word)

    # Convert both lists into numpy arrays for reduction.
    # (The original converted `vectors` twice; once is enough.)
    vectors = np.asarray(vectors)
    labels = np.asarray(labels)

    # Reduce using t-SNE; fixed random_state keeps the layout reproducible.
    tsne = TSNE(n_components=num_dimensions, random_state=0)
    vectors = tsne.fit_transform(vectors)

    x_vals = [v[0] for v in vectors]
    y_vals = [v[1] for v in vectors]
    return x_vals, y_vals, labels
| "\n", | |||||
| "\n", | |||||
| "#x_vals, y_vals, labels = reduce_dimensions(model)" | |||||
| ] | |||||
| } | |||||
| ], | |||||
| "metadata": { | |||||
| "kernelspec": { | |||||
| "display_name": "Python 3", | |||||
| "language": "python", | |||||
| "name": "python3" | |||||
| }, | |||||
| "language_info": { | |||||
| "codemirror_mode": { | |||||
| "name": "ipython", | |||||
| "version": 3 | |||||
| }, | |||||
| "file_extension": ".py", | |||||
| "mimetype": "text/x-python", | |||||
| "name": "python", | |||||
| "nbconvert_exporter": "python", | |||||
| "pygments_lexer": "ipython3", | |||||
| "version": "3.8.1" | |||||
| } | |||||
| }, | |||||
| "nbformat": 4, | |||||
| "nbformat_minor": 4 | |||||
| } | |||||