|
|
- {
- "cells": [
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {},
- "outputs": [],
- "source": [
- "import gensim\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {},
- "outputs": [],
- "source": [
- "model = gensim.models.KeyedVectors.load_word2vec_format('./GoogleNews-vectors-negative300.bin', binary=True) "
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "[('hi', 0.654898464679718),\n",
- " ('goodbye', 0.639905571937561),\n",
- " ('howdy', 0.6310957074165344),\n",
- " ('goodnight', 0.5920578241348267),\n",
- " ('greeting', 0.5855878591537476),\n",
- " ('Hello', 0.5842196941375732),\n",
- " (\"g'day\", 0.5754077434539795),\n",
- " ('See_ya', 0.5688871145248413),\n",
- " ('ya_doin', 0.5643119812011719),\n",
- " ('greet', 0.5636603832244873)]"
- ]
- },
- "execution_count": 6,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "model.most_similar(\"hello\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "[('coders', 0.6104331612586975),\n",
- " ('coder', 0.6063331365585327),\n",
- " ('Coding', 0.5804804563522339),\n",
- " ('formatting', 0.5671651363372803),\n",
- " ('soluble_receptors', 0.5576372146606445),\n",
- " ('ICD9', 0.5571348667144775),\n",
- " ('refactoring', 0.5495434999465942),\n",
- " ('database_schemas', 0.5372464656829834),\n",
- " ('recode', 0.534299373626709),\n",
- " ('XHTML_CSS', 0.5328801870346069)]"
- ]
- },
- "execution_count": 8,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "model.most_similar(\"coding\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "[('cats', 0.8099379539489746),\n",
- " ('dog', 0.7609456777572632),\n",
- " ('kitten', 0.7464985251426697),\n",
- " ('feline', 0.7326233983039856),\n",
- " ('beagle', 0.7150583267211914),\n",
- " ('puppy', 0.7075453996658325),\n",
- " ('pup', 0.6934291124343872),\n",
- " ('pet', 0.6891531348228455),\n",
- " ('felines', 0.6755931377410889),\n",
- " ('chihuahua', 0.6709762215614319)]"
- ]
- },
- "execution_count": 9,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "model.most_similar(\"cat\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 16,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "hi globe \n"
- ]
- }
- ],
- "source": [
- "def transformSentence(sentence):\n",
- " outputSentence = \"\"\n",
- " \n",
- " for word in sentence.split(\" \"):\n",
- " try:\n",
- " outputSentence += model.most_similar(word)[0][0] + \" \"\n",
- " except Exception:\n",
- " outputSentence += word + \" \"\n",
- " return outputSentence\n",
- "\n",
- "print(transformSentence(\"hello world\"))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 12,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "looks Mom No hand \n"
- ]
- }
- ],
- "source": [
- "print(transformSentence(\"look mom no hands\"))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 17,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "This gen_eral concept of Clustering was to groups Data wtih similiar trait \n"
- ]
- }
- ],
- "source": [
- "print(transformSentence(\"The general idea of clustering is to group data with similar traits\"))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 52,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "This manager concept of clusters was to groups datasets wtih similiar traits. \n"
- ]
- }
- ],
- "source": [
- "def removeFromString(string, chars):\n",
- " for c in chars:\n",
- " string = string.replace(c, \"\")\n",
- " return string\n",
- "\n",
- "\n",
- "def transformSentenceWithHeuristic(sentence):\n",
- " outputSentence = \"\"\n",
- " \n",
- " for word in sentence.split(\" \"):\n",
- " try:\n",
- " changed = False\n",
- " for w, _ in model.most_similar(word):\n",
- " clean = removeFromString(w, [' ', '_']).lower()\n",
- " if clean not in word.lower() and \"_\" not in w:\n",
- " outputSentence += w + \" \"\n",
- " changed = True\n",
- " break\n",
- " outputSentence = outputSentence if changed else outputSentence + word + \" \"\n",
- " except Exception:\n",
- " outputSentence += word + \" \"\n",
- " return outputSentence\n",
- "print(transformSentenceWithHeuristic(\"The general idea of clustering is to group data with similar traits.\"))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 53,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Relax up and grabbing a drinks but that was day it I talking abut this hallucinogenic trips it was this 1981 film Fever Treatment. \n"
- ]
- }
- ],
- "source": [
- "print(transformSentenceWithHeuristic(\"Sit down and grab a drink because it is time that we talk about the LSD trip that is the 1981 movie Shock Treatment.\"))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 54,
- "metadata": {},
- "outputs": [],
- "source": [
- "from sklearn.decomposition import IncrementalPCA # inital reduction\n",
- "from sklearn.manifold import TSNE # final reduction\n",
- "import numpy as np # array handling\n",
- "\n",
- "\n",
- "def reduce_dimensions(model):\n",
- " num_dimensions = 2 # final num dimensions (2D, 3D, etc)\n",
- "\n",
- " vectors = [] # positions in vector space\n",
- " labels = [] # keep track of words to label our data again later\n",
- " for word in model.wv.vocab:\n",
- " vectors.append(model.wv[word])\n",
- " labels.append(word)\n",
- "\n",
- " # convert both lists into numpy vectors for reduction\n",
- " vectors = np.asarray(vectors)\n",
- " labels = np.asarray(labels)\n",
- "\n",
- " # reduce using t-SNE\n",
- " vectors = np.asarray(vectors)\n",
- " tsne = TSNE(n_components=num_dimensions, random_state=0)\n",
- " vectors = tsne.fit_transform(vectors)\n",
- "\n",
- " x_vals = [v[0] for v in vectors]\n",
- " y_vals = [v[1] for v in vectors]\n",
- " return x_vals, y_vals, labels\n",
- "\n",
- "\n",
- "#x_vals, y_vals, labels = reduce_dimensions(model)"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.8.1"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 4
- }
|