From b77dff7e658d6fa5419c37fc044802c91389ad54 Mon Sep 17 00:00:00 2001 From: gyoza1 Date: Mon, 12 Dec 2022 12:49:24 -0500 Subject: [PATCH] as of Monday AM --- de_bello_gallico.ipynb | 1224 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1224 insertions(+) create mode 100644 de_bello_gallico.ipynb diff --git a/de_bello_gallico.ipynb b/de_bello_gallico.ipynb new file mode 100644 index 0000000..bd517bf --- /dev/null +++ b/de_bello_gallico.ipynb @@ -0,0 +1,1224 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "2b2f0cd7-2adc-451d-91c8-afa496cacb3f", + "metadata": {}, + "outputs": [], + "source": [ + "## Requires Python 3.7, 3.8, 3.9, 3.10 on a POSIX-compliant OS\n", + "\n", + "## Install the Classical Language Toolkit\n", + "# !pip install cltk" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "d3b6884a-0ef5-430a-b872-81fa97b98c56", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "from cltk import NLP\n", + "from cltk.data.fetch import FetchCorpus\n", + "from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer;\n", + "from sklearn.decomposition import PCA" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "f64d21f4-b285-4ba2-9b3f-8c8a7f29b45c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['lat_text_perseus',\n", + " 'lat_treebank_perseus',\n", + " 'lat_text_latin_library',\n", + " 'phi5',\n", + " 'phi7',\n", + " 'latin_proper_names_cltk',\n", + " 'lat_models_cltk',\n", + " 'latin_pos_lemmata_cltk',\n", + " 'latin_treebank_index_thomisticus',\n", + " 'latin_lexica_perseus',\n", + " 'latin_training_set_sentence_cltk',\n", + " 'latin_word2vec_cltk',\n", + " 'latin_text_antique_digiliblt',\n", + " 'latin_text_corpus_grammaticorum_latinorum',\n", + " 'latin_text_poeti_ditalia',\n", + " 'lat_text_tesserae',\n", + " 'cltk_lat_lewis_elementary_lexicon']" + ] + }, + "execution_count": 4, + 
"metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# get texts\n", + "\n", + "corpus_downloader = FetchCorpus(language=\"lat\")\n", + "corpus_downloader.list_corpora" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "9bcb9cbb-ca49-4a21-93db-d14c96b33c1d", + "metadata": {}, + "outputs": [], + "source": [ + "# corpus_downloader.import_corpus(\"lat_text_tesserae\") # downloads plain text files" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "cab0d102-f606-46c0-81c4-ec942c989430", + "metadata": {}, + "outputs": [], + "source": [ + "# variable assignment\n", + "\n", + "with open(\"/home/tim/cltk_data/lat/text/lat_text_tesserae/texts/caesar.de_bello_gallico.part.1.tess\") as fo:\n", + " ch1 = fo.read()\n", + "with open(\"/home/tim/cltk_data/lat/text/lat_text_tesserae/texts/caesar.de_bello_gallico.part.2.tess\") as fo:\n", + " ch2 = fo.read()\n", + "with open(\"/home/tim/cltk_data/lat/text/lat_text_tesserae/texts/caesar.de_bello_gallico.part.3.tess\") as fo:\n", + " ch3 = fo.read()\n", + "with open(\"/home/tim/cltk_data/lat/text/lat_text_tesserae/texts/caesar.de_bello_gallico.part.4.tess\") as fo:\n", + " ch4 = fo.read()\n", + "with open(\"/home/tim/cltk_data/lat/text/lat_text_tesserae/texts/caesar.de_bello_gallico.part.5.tess\") as fo:\n", + " ch5 = fo.read()\n", + "with open(\"/home/tim/cltk_data/lat/text/lat_text_tesserae/texts/caesar.de_bello_gallico.part.6.tess\") as fo:\n", + " ch6 = fo.read()\n", + "with open(\"/home/tim/cltk_data/lat/text/lat_text_tesserae/texts/caesar.de_bello_gallico.part.7.tess\") as fo:\n", + " ch7 = fo.read()\n", + "with open(\"/home/tim/cltk_data/lat/text/lat_text_tesserae/texts/caesar.de_bello_gallico.part.8.tess\") as fo:\n", + " ch8 = fo.read()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "46cc146d-77ae-4242-869c-45910279a544", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "‎𐤀 CLTK version 
'1.1.6'.\n", + "Pipeline for language 'Latin' (ISO: 'lat'): `LatinNormalizeProcess`, `LatinStanzaProcess`, `LatinEmbeddingsProcess`, `StopsProcess`, `LatinLexiconProcess`.\n" + ] + } + ], + "source": [ + "# instantiate the pipeline\n", + "\n", + "cltk_nlp = NLP(language=\"lat\")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "576affdd-cad6-485d-b0df-b97bd3b7611c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[, , , ]\n" + ] + } + ], + "source": [ + "# pre-processing\n", + "# remove ``LatinLexiconProcess`` because it is slow (adds ~9 mins total)\n", + "\n", + "cltk_nlp.pipeline.processes.pop(-1)\n", + "print(cltk_nlp.pipeline.processes)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "3e94698d-be62-4086-92d4-0b150a94fe72", + "metadata": {}, + "outputs": [], + "source": [ + "# process each text: textN is the analysis of chN\n", + "\n", + "text1 = cltk_nlp.analyze(text = ch1)\n", + "text2 = cltk_nlp.analyze(text = ch2)\n", + "text3 = cltk_nlp.analyze(text = ch3)\n", + "text4 = cltk_nlp.analyze(text = ch4)\n", + "text5 = cltk_nlp.analyze(text = ch5)\n", + "text6 = cltk_nlp.analyze(text = ch6)\n", + "text7 = cltk_nlp.analyze(text = ch7)\n", + "text8 = cltk_nlp.analyze(text = ch8)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "d8e781dc-9ab4-440e-90d9-15fb0ef4c93b", + "metadata": {}, + "outputs": [], + "source": [ + "# make a list of texts\n", + "\n", + "corpus = [text1.raw, text2.raw, text3.raw, text4.raw,\n", + " text5.raw, text6.raw, text7.raw, text8.raw]" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "b553b410-a903-426a-a369-9536e77b3348", + "metadata": {}, + "outputs": [], + "source": [ + "# instantiate the vectorizer\n", + "\n", + "vectorizer = TfidfVectorizer(min_df = 1.0, max_features = 400, use_idf = False)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "e2af9d0e-6b18-43c2-8875-5a8f79f2d4df", + "metadata": {}, + "outputs": [], + 
"source": [ + "text_frequencies = vectorizer.fit_transform(corpus).todense()" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "7f1e86e8-f5a3-4d19-836e-affcd622004e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(8, 295)" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "text_frequencies.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "160b9960-a03f-4240-a20b-5a61550b6c40", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "numpy.matrix" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "type(text_frequencies)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "def520f2-7563-418d-8c99-05558f08a25c", + "metadata": {}, + "outputs": [], + "source": [ + "# make the matrix a dataframe\n", + "\n", + "df = pd.DataFrame(text_frequencies)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "55b04e53-8374-4e86-b761-f5ae06df3341", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + "
0123456789...285286287288289290291292293294
00.0130700.0130700.0163380.0130700.0147040.0081690.0098030.0098030.0163380.008169...0.0049010.0114370.0032680.1209020.0408450.0032680.0016340.0049010.0065350.003268
10.0180430.0180430.0240580.0180430.0180430.0150360.0150360.0090220.0240580.009022...0.0060140.0210500.0090220.1172810.0240580.0030070.0030070.0090220.0120290.006014
20.0214410.0250150.0214410.0357350.0536030.0393090.0285880.0214410.0142940.010721...0.0178680.0071470.0071470.0929120.0214410.0035740.0035740.0107210.0071470.007147
30.0183970.0157690.0157690.0157690.0131410.0131410.0210250.0262820.0105130.010513...0.0078840.0078840.0052560.0841010.0262820.0052560.0052560.0078840.0078840.005256
40.0130700.0130700.0163380.0130700.0147040.0081690.0098030.0098030.0163380.008169...0.0049010.0114370.0032680.1209020.0408450.0032680.0016340.0049010.0065350.003268
50.0178450.0133840.0223060.0267680.0133840.0044610.0111530.0111530.0066920.008923...0.0066920.0111530.0089230.1115310.0066920.0066920.0022310.0066920.0022310.002231
60.0131130.0120200.0087420.0043710.0120200.0076490.0043710.0087420.0043710.006556...0.0054640.0065560.0021850.1059930.0043710.0032780.0010930.0054640.0021850.002185
70.0108650.0065190.0152110.0086920.0108650.0130380.0086920.0043460.0086920.017384...0.0195570.0152110.0021730.0890910.0021730.0021730.0021730.0043460.0021730.002173
\n", + "

8 rows × 295 columns

\n", + "
" + ], + "text/plain": [ + " 0 1 2 3 4 5 6 \\\n", + "0 0.013070 0.013070 0.016338 0.013070 0.014704 0.008169 0.009803 \n", + "1 0.018043 0.018043 0.024058 0.018043 0.018043 0.015036 0.015036 \n", + "2 0.021441 0.025015 0.021441 0.035735 0.053603 0.039309 0.028588 \n", + "3 0.018397 0.015769 0.015769 0.015769 0.013141 0.013141 0.021025 \n", + "4 0.013070 0.013070 0.016338 0.013070 0.014704 0.008169 0.009803 \n", + "5 0.017845 0.013384 0.022306 0.026768 0.013384 0.004461 0.011153 \n", + "6 0.013113 0.012020 0.008742 0.004371 0.012020 0.007649 0.004371 \n", + "7 0.010865 0.006519 0.015211 0.008692 0.010865 0.013038 0.008692 \n", + "\n", + " 7 8 9 ... 285 286 287 288 \\\n", + "0 0.009803 0.016338 0.008169 ... 0.004901 0.011437 0.003268 0.120902 \n", + "1 0.009022 0.024058 0.009022 ... 0.006014 0.021050 0.009022 0.117281 \n", + "2 0.021441 0.014294 0.010721 ... 0.017868 0.007147 0.007147 0.092912 \n", + "3 0.026282 0.010513 0.010513 ... 0.007884 0.007884 0.005256 0.084101 \n", + "4 0.009803 0.016338 0.008169 ... 0.004901 0.011437 0.003268 0.120902 \n", + "5 0.011153 0.006692 0.008923 ... 0.006692 0.011153 0.008923 0.111531 \n", + "6 0.008742 0.004371 0.006556 ... 0.005464 0.006556 0.002185 0.105993 \n", + "7 0.004346 0.008692 0.017384 ... 
0.019557 0.015211 0.002173 0.089091 \n", + "\n", + " 289 290 291 292 293 294 \n", + "0 0.040845 0.003268 0.001634 0.004901 0.006535 0.003268 \n", + "1 0.024058 0.003007 0.003007 0.009022 0.012029 0.006014 \n", + "2 0.021441 0.003574 0.003574 0.010721 0.007147 0.007147 \n", + "3 0.026282 0.005256 0.005256 0.007884 0.007884 0.005256 \n", + "4 0.040845 0.003268 0.001634 0.004901 0.006535 0.003268 \n", + "5 0.006692 0.006692 0.002231 0.006692 0.002231 0.002231 \n", + "6 0.004371 0.003278 0.001093 0.005464 0.002185 0.002185 \n", + "7 0.002173 0.002173 0.002173 0.004346 0.002173 0.002173 \n", + "\n", + "[8 rows x 295 columns]" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "e349f960-4a18-4237-b780-98a912fbfb16", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(8, 295)" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "f8a5160b-d739-40a7-80e8-f6672d1ed129", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/tim/predictive_analytics/DATA301_final_project/envs/lib/python3.9/site-packages/sklearn/utils/deprecation.py:87: FutureWarning: Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. 
Please use get_feature_names_out instead.\n", + " warnings.warn(msg, category=FutureWarning)\n" + ] + } + ], + "source": [ + "# we need labels for columns and rows\n", + "\n", + "wordnames = vectorizer.get_feature_names_out() # columns (get_feature_names is deprecated in scikit-learn 1.0, removed in 1.2)\n", + "titles = ['ch1', 'ch2', 'ch3', 'ch4', 'ch5', 'ch6', 'ch7', 'ch8'] # rows\n", + "\n", + "df = pd.DataFrame(text_frequencies, columns = wordnames, index = titles)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "43b51e60-ad52-41b1-97a7-c5a98403f937", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + "
10111213141516171819...undiqueunumususututivenitventumverovivulneribus
ch10.0130700.0130700.0163380.0130700.0147040.0081690.0098030.0098030.0163380.008169...0.0049010.0114370.0032680.1209020.0408450.0032680.0016340.0049010.0065350.003268
ch20.0180430.0180430.0240580.0180430.0180430.0150360.0150360.0090220.0240580.009022...0.0060140.0210500.0090220.1172810.0240580.0030070.0030070.0090220.0120290.006014
ch30.0214410.0250150.0214410.0357350.0536030.0393090.0285880.0214410.0142940.010721...0.0178680.0071470.0071470.0929120.0214410.0035740.0035740.0107210.0071470.007147
ch40.0183970.0157690.0157690.0157690.0131410.0131410.0210250.0262820.0105130.010513...0.0078840.0078840.0052560.0841010.0262820.0052560.0052560.0078840.0078840.005256
ch50.0130700.0130700.0163380.0130700.0147040.0081690.0098030.0098030.0163380.008169...0.0049010.0114370.0032680.1209020.0408450.0032680.0016340.0049010.0065350.003268
ch60.0178450.0133840.0223060.0267680.0133840.0044610.0111530.0111530.0066920.008923...0.0066920.0111530.0089230.1115310.0066920.0066920.0022310.0066920.0022310.002231
ch70.0131130.0120200.0087420.0043710.0120200.0076490.0043710.0087420.0043710.006556...0.0054640.0065560.0021850.1059930.0043710.0032780.0010930.0054640.0021850.002185
ch80.0108650.0065190.0152110.0086920.0108650.0130380.0086920.0043460.0086920.017384...0.0195570.0152110.0021730.0890910.0021730.0021730.0021730.0043460.0021730.002173
\n", + "

8 rows × 295 columns

\n", + "
" + ], + "text/plain": [ + " 10 11 12 13 14 15 16 \\\n", + "ch1 0.013070 0.013070 0.016338 0.013070 0.014704 0.008169 0.009803 \n", + "ch2 0.018043 0.018043 0.024058 0.018043 0.018043 0.015036 0.015036 \n", + "ch3 0.021441 0.025015 0.021441 0.035735 0.053603 0.039309 0.028588 \n", + "ch4 0.018397 0.015769 0.015769 0.015769 0.013141 0.013141 0.021025 \n", + "ch5 0.013070 0.013070 0.016338 0.013070 0.014704 0.008169 0.009803 \n", + "ch6 0.017845 0.013384 0.022306 0.026768 0.013384 0.004461 0.011153 \n", + "ch7 0.013113 0.012020 0.008742 0.004371 0.012020 0.007649 0.004371 \n", + "ch8 0.010865 0.006519 0.015211 0.008692 0.010865 0.013038 0.008692 \n", + "\n", + " 17 18 19 ... undique unum usus \\\n", + "ch1 0.009803 0.016338 0.008169 ... 0.004901 0.011437 0.003268 \n", + "ch2 0.009022 0.024058 0.009022 ... 0.006014 0.021050 0.009022 \n", + "ch3 0.021441 0.014294 0.010721 ... 0.017868 0.007147 0.007147 \n", + "ch4 0.026282 0.010513 0.010513 ... 0.007884 0.007884 0.005256 \n", + "ch5 0.009803 0.016338 0.008169 ... 0.004901 0.011437 0.003268 \n", + "ch6 0.011153 0.006692 0.008923 ... 0.006692 0.011153 0.008923 \n", + "ch7 0.008742 0.004371 0.006556 ... 0.005464 0.006556 0.002185 \n", + "ch8 0.004346 0.008692 0.017384 ... 
0.019557 0.015211 0.002173 \n", + "\n", + " ut uti venit ventum vero vi vulneribus \n", + "ch1 0.120902 0.040845 0.003268 0.001634 0.004901 0.006535 0.003268 \n", + "ch2 0.117281 0.024058 0.003007 0.003007 0.009022 0.012029 0.006014 \n", + "ch3 0.092912 0.021441 0.003574 0.003574 0.010721 0.007147 0.007147 \n", + "ch4 0.084101 0.026282 0.005256 0.005256 0.007884 0.007884 0.005256 \n", + "ch5 0.120902 0.040845 0.003268 0.001634 0.004901 0.006535 0.003268 \n", + "ch6 0.111531 0.006692 0.006692 0.002231 0.006692 0.002231 0.002231 \n", + "ch7 0.105993 0.004371 0.003278 0.001093 0.005464 0.002185 0.002185 \n", + "ch8 0.089091 0.002173 0.002173 0.002173 0.004346 0.002173 0.002173 \n", + "\n", + "[8 rows x 295 columns]" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "2a231cde-34d1-4444-a681-93fcef31f5d3", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "ch1 0.070254\n", + "ch2 0.075180\n", + "ch3 0.039309\n", + "ch4 0.094613\n", + "ch5 0.070254\n", + "Name: caesar, dtype: float64" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['caesar'].head() # how often does this word appear in each text?" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "e907cbac-dd41-4959-9602-43aa1dcd0805", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "df['caesar'].plot(kind = 'bar')" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "dc8ac837-e6e4-42a3-a249-546d14d03985", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(8, 295)" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# principal component analysis\n", + "\n", + "df.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "c4fdc6a7-7531-4e73-84ad-2e91d5f60d35", + "metadata": {}, + "outputs": [], + "source": [ + "# we will collapse our 295 components (most frequent words) down to just 2\n", + "# so that we can more easily plot the data\n", + "\n", + "pca = PCA(n_components = 2) # we want 2 components as output" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "9938feed-9a13-4a67-8269-6c10bef2f3e6", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/tim/predictive_analytics/DATA301_final_project/envs/lib/python3.9/site-packages/sklearn/utils/validation.py:727: FutureWarning: np.matrix usage is deprecated in 1.0 and will raise a TypeError in 1.2. Please convert to a numpy array with np.asarray. 
For more information see: https://numpy.org/doc/stable/reference/generated/numpy.matrix.html\n", + " warnings.warn(\n" + ] + } + ], + "source": [ + "corpus_reduced = pca.fit_transform(df.to_numpy()) # plain ndarray with the same values; np.matrix input raises TypeError from scikit-learn 1.2" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "699fd0a8-6a56-496d-be54-7a934d95a0a8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(8, 2)" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "corpus_reduced.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "a4c604a0-bd61-4e4a-8acb-3b87a15238d9", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[ 0.17327172, 0.0398041 ],\n", + " [-0.01712081, -0.02159975],\n", + " [-0.04462241, -0.05325602],\n", + " [ 0.0080804 , -0.08425615],\n", + " [ 0.17327172, 0.0398041 ],\n", + " [-0.09377177, -0.04113045],\n", + " [-0.07100618, -0.07465533],\n", + " [-0.12810267, 0.19528951]])" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "corpus_reduced" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "d0972829-9ffb-46bf-b6d2-925dbf79570a", + "metadata": {}, + "outputs": [], + "source": [ + "# make the matrix a dataframe\n", + "\n", + "# df_corpus_reduced = pd.DataFrame(corpus_reduced, columns = ['First Component', 'Second Component']),\n", + "\n", + "df_corpus_reduced = pd.DataFrame(corpus_reduced,\n", + " columns = ['First Component', 'Second Component'],\n", + " index = ['ch1', 'ch2', 'ch3', 'ch4', 'ch5', 'ch6', 'ch7', 'ch8'])" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "a27ea5bd-ef41-44d8-87e3-2dfdf5ccdd66", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
First ComponentSecond Component
ch10.1732720.039804
ch2-0.017121-0.021600
ch3-0.044622-0.053256
ch40.008080-0.084256
ch50.1732720.039804
ch6-0.093772-0.041130
ch7-0.071006-0.074655
ch8-0.1281030.195290
\n", + "
" + ], + "text/plain": [ + " First Component Second Component\n", + "ch1 0.173272 0.039804\n", + "ch2 -0.017121 -0.021600\n", + "ch3 -0.044622 -0.053256\n", + "ch4 0.008080 -0.084256\n", + "ch5 0.173272 0.039804\n", + "ch6 -0.093772 -0.041130\n", + "ch7 -0.071006 -0.074655\n", + "ch8 -0.128103 0.195290" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_corpus_reduced" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "id": "4d706c29-b70c-4602-88e0-b87e37b8b0c4", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# plot the dataframe\n", + "\n", + "# ax = df_corpus_reduced.plot(kind = scatter, x = 'First Component', y = 'Second Component')\n", + "\n", + "# plt.scatter(x = df_corpus_reduced['First Component'], y = df_corpus_reduced['Second Component'])\n", + "\n", + "plt.scatter(x = df_corpus_reduced['First Component'], y = df_corpus_reduced['Second Component'])\n", + "\n", + "# annotate the points\n", + "\n", + "for title, points in df_corpus_reduced.iterrows():\n", + " plt.annotate(title, points)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6b88561d-14ff-40f3-b9d8-b4905550daf4", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.15" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}