mirror of https://github.com/coqui-ai/TTS.git
762 lines
68 KiB
Plaintext
762 lines
68 KiB
Plaintext
|
{
|
|||
|
"cells": [
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 1,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"import os\n",
|
|||
|
"import glob\n",
|
|||
|
"import librosa\n",
|
|||
|
"import numpy as np\n",
|
|||
|
"import pandas as pd\n",
|
|||
|
"from tqdm import tqdm_notebook as tqdm\n",
|
|||
|
"from multiprocessing import Pool\n",
|
|||
|
"from matplotlib import pylab as plt\n",
|
|||
|
"from collections import Counter\n",
|
|||
|
"%matplotlib inline"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 5,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
"# Root folder of the extracted LJSpeech-1.1 corpus, relative to this notebook.\n",
"# Both paths below are derived from it, so relocating the dataset is a one-line change.\n",
"DATASET_ROOT = \"../../../Data/LJSpeech-1.1\"\n",
"DATA_PATH = DATASET_ROOT + \"/wavs/\"\n",
"META_PATH = DATASET_ROOT + \"/metadata.csv\"\n",
"# Loader parallelism switch: 1 takes the serial path, any other value uses a multiprocessing Pool.\n",
"NUM_PROC = 8"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 6,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
" > Number of audio files: 13100\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
"# Gather every .wav clip located directly under DATA_PATH and report how many were found.\n",
"wav_pattern = os.path.join(DATA_PATH, \"*.wav\")\n",
"file_names = glob.glob(wav_pattern)\n",
"print(\" > Number of audio files: {}\".format(len(file_names)))"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 7,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
"# Parse the pipe-separated metadata file into one list of fields per line.\n",
"# The context manager closes the file as soon as parsing is done. The trailing newline is\n",
"# stripped so it is not counted as part of the last field (it would inflate every text\n",
"# length by one), and blank lines are skipped instead of producing rows without fields.\n",
"with open(META_PATH, 'r', encoding='utf8') as meta_f:\n",
"    meta = [line.rstrip(\"\\n\").split(\"|\") for line in meta_f if line.strip()]"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 8,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"application/vnd.jupyter.widget-view+json": {
|
|||
|
"model_id": "f899c42f6f514ab9bf3834e5facef6a3",
|
|||
|
"version_major": 2,
|
|||
|
"version_minor": 0
|
|||
|
},
|
|||
|
"text/plain": [
|
|||
|
"HBox(children=(IntProgress(value=0, max=13100), HTML(value='')))"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
"def load_item(item):\n",
"    \"\"\"Load the audio clip of one metadata row and measure it.\n",
"\n",
"    `item` is one row of split metadata fields: field 0 names the clip\n",
"    (without the `.wav` extension) inside DATA_PATH and field 2 holds the text.\n",
"\n",
"    Returns the tuple `(text, text_len, audio, audio_len)`: the text, its\n",
"    character count, the samples returned by `librosa.load`, and the clip\n",
"    duration in seconds (sample count divided by the sampling rate).\n",
"    \"\"\"\n",
"    clip_id = item[0]\n",
"    transcript = item[2]\n",
"    wav_path = os.path.join(DATA_PATH, clip_id + '.wav')\n",
"    samples, sample_rate = librosa.load(wav_path)\n",
"    duration = len(samples) / sample_rate\n",
"    return transcript, len(transcript), samples, duration\n",
|
|||
|
"\n",
|
|||
"# This will take a while depending on size of dataset\n",
"if NUM_PROC == 1:\n",
"    # Serial path: no worker processes, easiest to debug.\n",
"    data = [load_item(m) for m in tqdm(meta)]\n",
"else:\n",
"    # Size the pool from NUM_PROC so the configured value actually controls the worker count.\n",
"    # imap yields results in input order, so data[i] corresponds to meta[i].\n",
"    with Pool(NUM_PROC) as p:\n",
"        data = list(tqdm(p.imap(load_item, meta), total=len(meta)))"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 9,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"application/vnd.jupyter.widget-view+json": {
|
|||
|
"model_id": "e42aca59abe14f8bb32b5d5f19af1c67",
|
|||
|
"version_major": 2,
|
|||
|
"version_minor": 0
|
|||
|
},
|
|||
|
"text/plain": [
|
|||
|
"HBox(children=(IntProgress(value=0, max=13100), HTML(value='')))"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"\n",
|
|||
|
" > Number of words: 22943\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
"# Case-insensitive histogram of whitespace-separated tokens over all texts.\n",
"w_count = Counter()\n",
"for sample in tqdm(data):\n",
"    # sample[0] is the text; update() adds one count per token occurrence\n",
"    w_count.update(sample[0].lower().split())\n",
"print(\" > Number of words: {}\".format(len(w_count)))"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 10,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"application/vnd.jupyter.widget-view+json": {
|
|||
|
"model_id": "647a2e1810324971aacb971acff91fb3",
|
|||
|
"version_major": 2,
|
|||
|
"version_minor": 0
|
|||
|
},
|
|||
|
"text/plain": [
|
|||
|
"HBox(children=(IntProgress(value=0, max=13100), HTML(value='')))"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
"text_vs_durs = {}  # text length -> list of audio durations observed for that length\n",
"text_len_counter = Counter()  # number of sentences with the keyed length\n",
"for item in tqdm(data):\n",
"    text = item[0].lower()\n",
"    text_len = len(text)\n",
"    text_len_counter[text_len] += 1\n",
"    audio_len = item[-1]  # last field of a loaded item is the clip duration\n",
"    # setdefault creates the list the first time a length is seen, so no exception\n",
"    # handling is needed and genuine errors are no longer silently swallowed.\n",
"    text_vs_durs.setdefault(text_len, []).append(audio_len)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 11,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
"# For each text length: mean, median and standard deviation of the durations recorded for it.\n",
"text_vs_avg = {length: np.mean(durations) for length, durations in text_vs_durs.items()}\n",
"text_vs_median = {length: np.median(durations) for length, durations in text_vs_durs.items()}\n",
"text_vs_std = {length: np.std(durations) for length, durations in text_vs_durs.items()}"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"### Plot Dataset Statistics"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 12,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"<matplotlib.collections.PathCollection at 0x7f2428497a90>"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 12,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAXQAAAEICAYAAABPgw/pAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAHuRJREFUeJzt3X+cXXV95/HXO5ML3AmYCZBSMhiDVnGhrEDn0Y1VWRYoLGCARQlYULD2QX3UbhUxCNVCaGlBI0p3t4+6sbqIoBIjToPdinYr9SF9hJo4iSFCVuRH4CbAgIwgmYUh+e4f59xw5ub+OOf+/vF+Ph7zmHvPPXPu556Z+dzv/Xx/HIUQMDOz3jen0wGYmVlzOKGbmfUJJ3Qzsz7hhG5m1iec0M3M+oQTuplZn3BC7zOSgqTf6MDzniTpiXY/7yCQ9KikU+Pbfyrp75p03Hsk/UEzjpXy+ZoWu5XnhN5iyX/GJhzrFknXN+NYjerUG8egCyH8VQihbUm4XuXe4Hsl9l7mhG5mmSji3NGF/EtpIUlfARYDd0n6laQr4+1LJf2rpClJmyWdFG8/WNITkpbF9w+U9JCk90m6DLgIuDI+1l0pnn9/SZ+RtF3SU5I+LykfP3ZS/FxXSHpa0k5J70/87CGS7pL0vKQfSbpe0g/jx34Q77Y5juWCxM+VPV5JXBdI2lCy7XJJ6+LbZ0r6qaQXJBUkfazCcS6VdK+kz8Xn8mFJvxNvfzyO45KU52OBpG9LmpT0XHz7iMTP3iPpL+Lne0HSdyUdWiGuWsea9alN0kpJtyXuv1fSY5KelfSJkmOX7nu2pK3x679H0r8rF1O87+9KelDSLyX9D0BVjrsk/hQ2N/H6/1LSvcAu4PWS3i/pgfh8PCzpD+N95wH/CCyK/z5+JWlRltjjc/QxST+J471D0gGVXpvFQgj+auEX8ChwauL+KPAscCbRG+rvxvcXxo+fBjwJ/BrwBWBt4mdvAa6v8XwB+I349ueAdcDBwEHAXcAN8WMnAa8Afw7k4nh2AQvix78efw0DRwOPAz8s9zxpjlcS4zDwAvDGxLYfARfGt3cC74hvLwBOqPBaL42f8/3AEHA9sB34G2D/+Fy+AByY4nwcArwrju0g4BvAeOK57gF+DrwJyMf3b6wQV61jlf5NrARui28fDfwKODF+DZ+NX+OpZfZ9E/Ai0d9QDrgSeAjYr0xMh8bn4t3xvpfHx/2D0uPG95fEv+O5ide/HTgGmBsf4yzgDURvDP8x/n2fkPh7eKIkhtSxx+fo34BF8e/rAeCDnf5/7vavjgfQ719l/nk/DnylZJ+7gUsS9/87sAUoAIcktt9CyoQe/5O9CLwh8dhbgUfi2ycB08V/2Hjb08BSouQ4AxyVeOx6aif0sserEOdtwDXx7TfGyWY4vr8d+EPgNTVe66XAzxL3j43jOiyx7VnguFrno8yxjwOeS9y/B/hk4v4fAd9J+TdQeqzSv4lkorsG+HrisXnAy5RP6H8GrEnsOyf+mzmpTAzvA9Yn7gt4gmwJ/c9rvM5x4MOJv4dqCb1q7PE5ujjx+KeBzzf7/7Pfvlxyab/XAefHHzOnJE0BbwcOT+yzGvhN4JYQwrN1Ps9CohbixsTzfCfeXvRsCOGVxP1dwIHxPnOJWuVFyduVVDpeOV8F3hPf/j2iFuyu+P67iFr4j0n6F0lvrfKcTyVuTwOEEEq3FV9TxfMhaVjS/4xLHc8DPwBGJA0ljvVkmteW8liVLCJxrkMILxK9KVXa97HEvnvinx1NcdxAut9p0qz9JZ0hab2kX8Tn80yiTwJppIk91fm2Vzmht17pcpaPE7XQRxJf80IINwLE//SrgVuBP9LskSRZlsZ8hiiZHZN4nvkhhDT/FJNEH8ePSGx7bYbnTuN7wEJJxxEl9q8WHwgh/CiEcA5R2WkcWNOE56t1Pq4AjgL+QwjhNUQlD0jUmTOodawXid5cin49cXsniXMtaZiohFPODqIGQnFfxT9bKLNv6XHF7N9ptZiK9v79Sdof+CbwGaJPRCPA/+bV11jrbzVL7JaSE3rrPQ
W8PnH/NmCZpNMlDUk6QFEHZTF5/inRP8PvA6uAWxMtu9JjVRS3eL4AfE7SrwFIGpV0eoqf3Q3cCayMW5tvJvrIXu11ZRJCmCGqLa8iqpF+L45xP0kXSZof7/M8sKfe50k8X63zcRBRwp+SdDBwbQNPV+tYm4ALJeUkjRHVtYvWAu+U9HZJ+xH1SVT6P10DnCXpFEk5ojeSl4B/LbPvPwDHSDov7uj8E2Yn7U3AiZIWS5oPXF3jNe5HVOOfBF6RdAZRn0XRU8Ah8bEajd1SckJvvRuAT8Yf8z8WQngcOIcocU8StdhXAHMk/RbwUeB9cVL9FFFyvyo+1heBo+Njjad47o8TdTStjz/6/xNRyzGNPwbmE33s/QrwNaJ/uKKVwJfjWJanPGaprwKnAt8oKdW8F3g0jvmDRKN7mqHa+biZqLPzGWA9UTmmXrWO9WdEnYnPAdcx+9PJVuBD8bad8T5lJ2yFELYBFxP1uTwDLAOWhRBeLrPvM8D5wI1EJZw3AvcmHv8ecAfwE2Aj8O1qLzCE8ALRm8KaOMbfI+pwLj7+INHfzMPx38iiemO39BR3OJhVJelTwK+HEC6pubOZdYRb6FaWpDdL+veK/DbwAeBbnY7LzCqb2+kArGsdRPSReRFRPfQm4O87GpGZVeWSi5lZn3DJxcysT7S15HLooYeGJUuWtPMpzcx63saNG58JISystV9bE/qSJUvYsGFD7R3NzGwvSY/V3sslFzOzvuGEbmbWJ5zQzcz6hBO6mVmfcEI3M+sTTuhmZn3CU//NzFpkfKLAqru3sWNqmkUjeVacfhTnHl/u+iPN4YRuZtaASkl7fKLA1XduYXpmNwCFqWmuvnMLQMuSes2ELulLwDuBp0MIvxlvO5ho7eQlRNf+Wx5CeK4lEZqZdVgxaRemphmS2B0CoyN5/tObF/LNjYWySXvV3dv2bi+antnNqru3tSyh11ycS9KJRFchvzWR0D8N/CKEcKOkq4iu7P7xWk82NjYWPFPUzDqpVhmk9PHSpJ0kyl9rb3Qkz46p6bKPCXjkxrMyxSxpYwhhrNZ+NVvoIYQfSFpSsvkcoqt6A3yZ6IrgNRO6mVknVSqDbHjsF3z/wUkKU9OzknRhaprb12+veIHUStuLbwaFqel9Hls0km/0ZVRU7yiXw0IIO+PbTwKHVdpR0mWSNkjaMDk5WefTmZk1rlIZ5Pb12/cm39IkXc8C48WWfz43NGt7PjfEitPTXgUyu4Y7RUMIQVLF1xxCWE10FXvGxsa8+LqZNV2lGndpOWVHmRYz1Je0q3nxpVe4/I5NzM/nOCA3h6ldM109yuUpSYeHEHZKOhx4uplBmZmlVVpG2R33Cxamprn8jk185I5Ne5N88XurTU3P7P2ezw3xuQuOa2kiL6o3oa8DLiG6gvgl+NJkZtYmyU7L+fkcz/+/GfZUyNHFzcUk3kgyL9bWs745tHpkS1KaYYtfI+oAPVTSE8C1RIl8jaQPAI8By1sZpJkZ7NsaL7aE2yEQjV6596qTy8ZSTaVST7OlGeXyngoPndLkWMzM9pGsj3daMjEXW9zJIY4vvvRK2TeZVo5sSfJMUTPrGlnGgLdKtVJKaWI+9/jRfcawl7baWz2yJckJ3cxaLs2aJuXGiN+2fnvbY90TAjdfcFxdiblcq73VI1uSas4UbSbPFDUbPNVqzRKEUHnGZVa5OWLe/nMbqq0X6+TtXlirmqbNFDUzyyqZDOdUKWEUNzerWXngAXO5dtkxZVvX7/qtUb7/4GTVck6yFV5aTsmqE28ITuhm1hTJzstki7sd476LpnbNZCp7jL3u4LqSbpr1YNq90iK45GJmTZBlCF8zVOq4TA4rbJVKHZ83nHfs3mT9thv/ueyonHrjS1ty8RWLzKwh4xMFrlizuS3JPJ8b4uYLjuOm5W9p+zopRdWWxS2qNO681ePRXXIxs7qMTxRYuW5rSyb3DM0RB8Wdm9XWZulEp2WaZN2JlRbBCd3MMqhUJ6
9XuVEuC4ZzXLvsmJrJudFOy3qlSdYrTj+qI+PRndDNrKbxiQLX3bWV53a92hqvN5m3o87dSmmSdafGozuhm1lVzez
|
|||
|
"text/plain": [
|
|||
|
"<matplotlib.figure.Figure at 0x7f23b8395470>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
"# Mean clip duration for every distinct text length; labelled so the figure reads on its own.\n",
"plt.title(\"text length vs mean audio duration\")\n",
"plt.xlabel(\"text length (characters)\")\n",
"plt.ylabel(\"mean audio duration (s)\")\n",
"plt.scatter(list(text_vs_avg.keys()), list(text_vs_avg.values()))"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 13,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"<matplotlib.collections.PathCollection at 0x7f24206c7358>"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 13,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAXQAAAEICAYAAABPgw/pAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAIABJREFUeJzt3X2UXHWd5/H3J50CqwOTDpJxSGMMPuHCOAL2mY2jsgwyMoIB1gfAAQUfDuMZZ8YHRIOihFlmjUaU2d056+LqooIKIvYEZxR1kfHInDAmdjBGyIoCgSKEgDQgaaWT/u0f9954u1IP99bzw+d1Tp+uunXr3m/drv7Wr7739/tdhRAwM7P+t6DbAZiZWWs4oZuZDQgndDOzAeGEbmY2IJzQzcwGhBO6mdmAcELvY5KCpOd3Yb8nSHqg0/ttp/SxlPQZSR/pdkz1SLpX0knx7Q9J+t8t2u6tkt7Rim1l3F/LYh92C7sdwCCRdC/wjhDC91qwrauBB0IIlzS7rRbEEoAXhBDu7nYsnRBCeGe3Y8grhPBfux1DFpJOAK4JIRyeLOuX2PuBW+hm1hKKOKd0kQ9+i0j6ErAcuEnSryV9IF6+UtK/SZqWdEfcQkHSIZIekLQqvn+QpLslvUXSBcA5wAfibd2UYf8HSvqkpO2SdsZlg2L82Anxvi6U9LCkHZLemnruMyXdJOkJST+SdLmkH8aP/SBe7Y44lrNSz6u4vbK4zpK0sWzZeyWtj2+fIulnkp6UVJL0/irbOV/SbZI+HR/LX0r6k3j5/XEc52U5HvHjF8VxPyjpbWX7ulrS5fHtJZK+KWmXpMfi24en1r1V0n+JY3tS0nckHVrlNdTb1r4SSnx/jaRrUvffLOk+SY9K+nDZtsvXPU3S1vhY3SrpP1SKKV73zyTdJelxSf8DUI3trlBUnlqYev1/L+k2YDfwXElvlXRnfDx+Kekv43UXAd8ClsXvpV9LWpYn9vgYvV/ST+J4r5P0jGqvbeiEEPzToh/gXuCk1P1x4FHgFKIPzz+L7y+NH3818BDw+8BngRtSz70auLzO/gLw/Pj2p4H1wCHAwcBNwMfix04A9gB/BxTieHYDS+LHvxr/jAJHAfcDP6y0nyzbK4txFHiSqGSTLPsRcHZ8ewfwyvj2EuC4Kq/1/HifbwVGgMuB7cA/AgfGx/JJ4KAMx+PPgZ3AHwKLgC+XHct9xx54JvD6+HUcDHwNmEzFdSvwC+CFQDG+v7bKa6i3rfL3zxqi8gTx3+XXwPHx6/1UfDxOqrDuC4GniN5vBeADwN3AARViOjQ+bm+I131vvN13lG83vr8iPlYLU69/O3A0UQm3AJwKPI/og+E/xe+N41LvnQfKYsgce3yM/h1YFv9t7wTe2e3//V756XoAg/RT4R/yg8CXyta5GTgvdf+/A1uAEvDM1PJ9SaXG/gLw/Pgf5yngeanHXgbcE98+AZhJ/gnjZQ8DK4mS4yxwZOqxy6mf0Ctur0qc1wAfjW+/IE4go/H97cBfAr9X57WeD/w8df/FcVzPSi17FDgmw/H4PKmkGyeRigm9QhzHAI+l7t8KXJK6/1fAtzO+X8q3Vf7+WcPvEt1Hga+mHlsEPE3lhP4R4PrUugvi99cJFWJ4C7AhdV/AA+RL6H9X53VOAu9OvXdqJfSascfH6NzU458APtPs/+6g/Ljk0l7PAd4Yf3WcljQNvAI4LLXOVUQtxatDCI82uJ+lRK2+Tan9fDtenng0hLAndX83cFC8zkKiVnkifbuaatur5MvAm+Lbf0HUKt0d3389UQv/Pkn/KullNfa5M3V7BiCEUL4seU21jscy5r/G+6rtUNKopP8VlzqeAH4AjEkaSa32UOp21eOQcVvVzIs5hPAU0QdYtXXvS607Fz93PMN2A9n+/mnz1pf0GkkbJP0qPvanEH0TyCJL7JmO9zByQm+t8qkr7ydqoY+lfhaFENYCxP/IVwFfBP5K87sg5pkG8xGiZHZ0aj+LQwhZ3ui7iL5iH55a9uwc+87iu8BSSccQJfYvJw+EEH4UQjidqOw0CVzfgv3VOx
47mP8al9fY1oXAkcB/DCH8HlHJA1J15hzqbespog+ixB+kbs+LWdIoUQmnkgeJGhPJuoqfW6qwbvl2xfxjUyumxL73qqQDga8DnyT69jQG/Au/e4313td5YrcyTuittRN4bur+NcAqSSdLGpH0DEUnKJPk+SGiN/jbgHXAF1OttfJtVRW3Yj4LfFrS7wNIGpd0cobn7gVuBNbELcgXEX0Nr/W6cgkhzBLVi9cR1T2/G8d4gKRzJC2O13kCmGt0P6n91Tse1wPnSzoqToyX1tjcwUQfDtOSDqmzbj31trUZOFtSQdIEUV07cQPwWkmvkHQA0fmLav+/1wOnSnqVpALRB8lvgX+rsO4/A0dLel18ovNvmZ+0NwPHS1ouaTFwcZ3XeABRjX8XsEfSa4jObyR2As+Mt9Vs7FbGCb21PgZcEn/Nf38I4X7gdKLEvYuoxX4RsEDSS4H3AW+Jk+rHiZL76nhbnwOOirc1mWHfHyQ6ebQh/jr/PaLWYBZ/DSwm+ir7JeArRP9EiTXAF+JYzsy4zXJfBk4CvlZWqnkzcG8c8zuJeve0QtXjEUL4FnAlcEu8zi01tnMl0cnOR4ANRKWbRtXb1keITiY+BlzG/G8yW4F3xct2xOtUHNwVQtgGnEt0fuYRYBWwKoTwdIV1HwHeCKwlKuG8ALgt9fh3geuAnwCbgG/WeoEhhCeJPhSuj2P8C6KT08njdxG9v34Zv5+WNRq77U/xiQWzfSR9HPiDEMJ5dVc2s57hFroh6UWS/kiRPwbeDnyj23GZWT4e+m8Q1Xa/QtTDYCdwBfBPXY3IzHJzycXMbEC45GJmNiA6WnI59NBDw4oVKzq5SzOzvrdp06ZHQghL663X0YS+YsUKNm7cWH9FMzPbR1LV0cxpLrmYmQ0IJ3QzswHhhG5mNiCc0M3MBoQTupnZgHBCNzMbEB76b2bWJpNTJdbdvI0Hp2dYNlbkopOP5IxjK11npDWc0M3M6qiVmKs9NjlV4uIbtzAzuxeA0vQMF9+4BaBtSb3uXC6SPg+8Fng4hPCH8bJDiOZIXkF0jb8zQwiP1dvZxMRE8MAiM+umvK3m8sScWDJa4NQ/OoyvbyrNe6xYGOFjr3sx627eRml6Zr/tjY8VuW31iblilrQphDBRd70MCf14oquNfzGV0D8B/CqEsFbSaqKrvX+w3s6c0M2smyol5yQBV2tx7356D4/tnq24PVH5mnrjY0UenJ6p+JiAe9aemivurAm97knREMIPgF+VLT4d+EJ8+wvAGbmiMzNrg8mpEi9fewtHrP5nXr72Fian5l+KdN3N2/Zrac/M7mXdzdv2Pf/iG7dQipNxaXqmajKH6hdITT4MKqm2vBUaraE/K4SwI779EPCsaitKugC4AGD58lrX4jUza8zkVInLbto6L/lWqlk/WKEEkqz78rW3VCyRNCIp5VT6NnDRyVmvDJlfpvnQJa0AvpkquUzHV/NOHn8shLCk3nZccjGzvOrVvKvVuBNjxQKLDlzIg9MzLJDY24FrQIwVCzw+M8viYgEJpnfPNtXLJWvJpdEW+k5Jh4UQdkg6DHi4we2YmVVVqafIRTfcwZr1W/clzCd+M8tcjRw9PTPL9EzUcm80mQsoFhawe3Yu0/rJ/qZnZikWRvj0Wce0tbtiotGBReuB5ALC5+HLlZlZG1Sqec/uDUzPzBKIEmatZN4qAQiIc1cuZ3ysiIARKdNz0zX6dqvbQpf0FeAE4FBJDwCXAmuB6yW9HbgPOLOdQZpZf8vaVTBZrzQ9w0iHyiNZzczu5ft37drX5bBeqSetWu2+1eom9BDCm6o89KoWx2JmA6jeAJt0Ek93A+ylZJ5IJ+bkAyn9QfXUb/fsK7ektbNnS5pHippZW9XrKphO9r2QwiWo9llSnpjPOHa87gnadvdsSXNCN7OWS5dYqiXp0vQM77luc0v3W22gT611k9LOeI1h+5AtMVdqtbd7/pY0J3Qza6k8teVWC0SjNKv1Jx+RmA
uhbqJtJjGXt9o7yQndzFoiXQvvlqSVfdENdzC7d35bvbBArHvjSzIn22YTc6dnWgQndDNrgXa2ypOSyFhqkM7YaIF
|
|||
|
"text/plain": [
|
|||
|
"<matplotlib.figure.Figure at 0x7f23ba4515f8>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
"# Median clip duration for every distinct text length; labelled so the figure reads on its own.\n",
"plt.title(\"text length vs median audio duration\")\n",
"plt.xlabel(\"text length (characters)\")\n",
"plt.ylabel(\"median audio duration (s)\")\n",
"plt.scatter(list(text_vs_median.keys()), list(text_vs_median.values()))"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 14,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"<matplotlib.collections.PathCollection at 0x7f242065e8d0>"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 14,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAXcAAAEICAYAAACktLTqAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAIABJREFUeJzt3X2UHOV15/Hv1aiBERAGgpKYAYGSEBQTYmQUzB4lWSAYMF5AgQSB7djZOMtmE/YsGCsrr31AYZ1FjpaY7Fl2syTh+CU4CIw9kQ8k4ASSbNjgaOSRYouAQ3hVQ2wZGNswA5qR7v7R1aKmp6q6qru6u6r69zlnjmaqS93PVPfceuo+93nK3B0REamWJYNugIiI5E/BXUSkghTcRUQqSMFdRKSCFNxFRCpIwV1EpIIU3KU0zMzN7EcH8Lpnm9mefr+uSDcU3KVjZvaMmZ2X03N9ysw+nsdzdWtQJ5GWNhxiZreY2R4zezU41rcGj70a+jpgZrOhn99rZpvMbM7Mvhd8fcPM/qeZvWWQv5P0l4K7SDF9BFgDnAkcCZwNfBXA3Y9ofgHPAReHtt0Z/P+t7n4kcAzw88APATsU4IeHgrt0xMw+C6wAvhT0GH8z2H6Wmf0/M5s2s11mdnaw/ZigF3px8PMRZvakmb3fzK4G3gv8ZvBcX0rx+oea2X83s+fM7Jtm9vtmNho8dnbwWteb2bfM7EUz+7eh//v9ZvYlM/uumW03s4+b2d8Gj/1NsNuuoC3rQ/8v8vla2rXezCZbtl1nZtuC7y8ys8eCHnXdzD4c8yv+FPBFd3/BG55x98+0Oy6t3H3O3XcD64G9wPVZn0NKyt31pa+OvoBngPNCP48DLwEX0eg4vDP4eXnw+PnAvwA/APwB8PnQ//0U8PE2r+fAjwbffxLYRqNneiTwJeDm4LGzgXngJqAWtGcGODp4/K7gaxnwVuB54G+jXifN87W0cRnwPeDk0LbtwJXB9y8CPxN8fzTw9pjf9WM0euW/DpwGWJr3INi2CfjjiH1vAr4y6M+NvvrzpZ675Ol9wP3ufr+7H3D3LwOTNIIh7v4gcA/wl8G2f9/Ji5iZAVcD17n7y+7+PeC/AVeGdpsDbvJGz/V+4FXgFDMbAS4HbnT3GXd/DPh0ipeNfL7Wndx9BvhT4KqgrScDq2iciJrP81Yz+z53f8XdvxrzejcDn6BxRTMJ1M3sAynameQFGidDGQIK7pKnE4FfDFIy02Y2Dfw0EM7z3g78BPApd3+pw9dZTqOHvCP0On8ebG96yd3nQz/PAEcE+yyl0VtvCn8fJ+75onyOILgD7wEmgqAPjRPLRcCzZvbXZvavop7A3fe7+23uvhYYA34buMPMfjxFW+OMAy938f+lRBTcpRutS4o+D3zW3cdCX4e7+2aAoNd8O/AZ4NdbKlKyLE/6bWAWODX0Okd5Y4Cxnb00UizHh7adkOG10/gysNzMTqcR5D/XfMDdt7v7pTRSUxPA3e2ezN1n3f024BUaaaTMzGwJcDHwfzv5/1I+Cu7SjW8CPxz6+Y+Bi83sAjMbMbPDgsHNZiD9LzSC+K8AW4DPBAE/6rliufsBGjn7T5rZDwCY2biZXZDi/+4HvgBsMrNlZrYKeH+b3ysTd5+jkX7aQiMN8uWgjYcEpYpHBft8FzgQ9Rxmdm1w7EbNbGmQkjkSmMrSluD//jjwJzQqZn63099LykXBXbpxM/CxIDXyYXd/HriURhDfS6MnvwFYYmZnAB8C3h8E2E/QCPQbg+f6Ixq56Gkzm0jx2v8ZeBJ41My+C/wFETnwGNcAR9EY3P0sjcD3RujxTcCng7ZckfI5W30OOA+4pyWd80vAM0Gbf41GTj3KDHBL0MZvA78BXO7uT6V8/fVm9irwHRr5/peAM9z9hcy/iZSSuetmHTLczOwTwA+5e7cDliKFoZ67DB0zW2VmP2kNZwIfBL446HaJ5GnpoBsgMgBH0kjFHEcjv34LjfJFkcpQWkZEpIKUlhERqaCBpWWOPfZYP+mkkwb18iIipbRjx45vu/vydvsNLLifdNJJTE5Ott
9RREQOMrNn0+yntIyISAUpuIuIVJCCu4hIBSm4i4hUkIK7iEgFKbiLiFSQgruISAUpuIuIVJAWDiu4iak6Wx54ghemZzlubJQNF5zCutXjg26WiBScgnuBTUzV+cgXvsbs3H4A6tOzfOQLXwNQgBeRRErLFNiWB544GNibZuf2s+WBJwbUIhEpCwX3AnthejbTdhGRJgX3AjtubDTTdhGRJgX3AttwwSmM1kYWbButjbDhgrT3gRaRYaUB1QJrDpqqWkZEslJwL7h1q8cVzCtIJa7SawruIn2mElfpB+XcRfpMJa7SDwruIn2mElfph1TB3cwuNLMnzOxJM9sY8fgKM3vYzKbM7B/M7KL8mypSDSpxlX5oG9zNbAS4DXgX8FbgKjN7a8tuHwPudvfVwJXA/8q7oSJVoRJX6Yc0A6pnAk+6+1MAZnYXcCnwWGgfB74v+P4o4IU8GylSJSpxlX5IE9zHgedDP+8B3tGyzybgQTP7j8DhwHlRT2RmVwNXA6xYsSJrW0UqQyWu0mt5DaheBXzK3Y8HLgI+a2aLntvdb3f3Ne6+Zvny5Tm9tIiItEoT3OvACaGfjw+2hX0QuBvA3f8OOAw4No8GiohIdmnSMtuBk81sJY2gfiXwnpZ9ngN+DviUmf04jeC+N8+GihSNZplKkbUN7u4+b2bXAA8AI8Ad7r7bzG4CJt19G3A98Admdh2NwdVfdnfvZcNFBimvWaY6QUiv2KBi8Jo1a3xycnIgry3SrbWbH6IeMelofGyURzaem+o5Wk8Q0CiJvPmy0xTgJZaZ7XD3Ne3209oyIh3oZpZps7cedXKYndvPpm271ZuXrim4y9BLmxoJ77fEjP0RV73tZplG9dZbTc/OMT07B2hRMemc1paRodYMtvXpWZw3g+nEVD1xv6jAnmaWadSiYe1oUTHphIK7DLW0KzTGBeURM4xGrj1NrrzTxcG0qJhkpbSMDLW0ufO4/Q648/Tmd6d+vePGRiNz7dA4Qczsm+eVmbnI/yeShXruMtTSrtCY10qOcYuG3br+dB7ZeC43XnxqqkXFJqbqrN38ECs33sfazQ8tSiOJKLjLUEu7QmNeKzmuWz3OzZedxvjYaGQ6p93jkH6cQIab0jIy1LKs0HhYbcnBvPvYaI1Nl5yaqoIlqhonqRa+3aJiSeMEqqiRJgV3GXrtgmlU+eIb8wdSPXcv7peqOzlJGkrLiLTR6T1PJ6bqXH/3rtzvlxqX519iphy8HKTgLtJGJz3lZo89qh4+6v9mGSCNyv9Do/ZeOXhpUlpGpI248sVwD7o1rz6zbz5xslLr/82SumkdJ4iaLascvCi4SyXltdrixFSd196YX7Q9XCkTFZyTtFbZxKV9rr97FxAf4JvbV268L/J1lIMfbgruUjl5LscbtQ7M0ctq3HjxqQt60FmWFAjn3NetHo8NwvvdU7W73ZVF1Imu2W4tTlZdyrlL5XQ6AJrmeQCWHbJ0QSDspIcczosnTYRK0+6oHLwB56xaHlkTv+Hzu9hwzy7VyVecgrsMVC9mWuZVKpj2eeKC89ho7eBkpBGzRY83A3fcAGm7djStWz3O5WeME34FB+7dUWfTtt2LTlBz+525A9E5eqkOpWVkYHpRAw7pBkDzfJ4NF5wSedON8CSnpLx4c5/r794VWV1z1GiNtZsf4oXpWY4arWEG0zNzC9IpDz++l9b/OTu3P1O6qN1YgZSLeu4yMHmlT1p1s1RA+EritTfmqY0s7HFHPU+aJQParU2zbvU4t1zxtkXtri0xXts3fzCFMj07xyszc4vSKXkMnhooNVMhus2eDMzKjfct6m1CI8hkWWkxSusg4jmrlvPw43sTBxA/NvE17nz0uQVtqi0xjjhs6aKeciftSXNLvaiSyqhVIsNGYm4cAo3B39fnDix43dqIMbc/ev8stwmUwdBt9qTw8kqfRAmXCsalfyaffflgwB9bVosMonMHnGWHLGXqhvO7bg+0r1BpXQohLp0TFh
fYR2sj3HjxqZGve+3WnZH/R+WT1aHgLgMTl6vOutJiO3Hpn3AvPal3nFfAa7eGTZSk9d+TjLecPFpfN+4erseNjeY
|
|||
|
"text/plain": [
|
|||
|
"<matplotlib.figure.Figure at 0x7f24206fe630>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
"# Spread of clip durations for every distinct text length; labelled so the figure reads on its own.\n",
"plt.title(\"text length vs STD\")\n",
"plt.xlabel(\"text length (characters)\")\n",
"plt.ylabel(\"std of audio duration (s)\")\n",
"plt.scatter(list(text_vs_std.keys()), list(text_vs_std.values()))"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 15,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"<matplotlib.collections.PathCollection at 0x7f24205f2358>"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 15,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAXoAAAEICAYAAABRSj9aAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAIABJREFUeJzt3X2UHHWd7/H3N0MDE0SGyCxLhodExHBFlwTmetkbdQEfIrBClJWH9YFV76LnrucsrEbD6lmz6F2jWRZ3r3vVsHIQQQwajFG8C6xh17vcjTgxAwEhF5DwMAYYgQExI06S7/2jqiY1PVXV1c/d1Z/XOXOmu7q7+tfVM9/61ff3ZO6OiIgU15x2F0BERJpLgV5EpOAU6EVECk6BXkSk4BToRUQKToFeRKTgFOilKczMzewVbXjfU83s8Va/by3M7C/N7J/aXQ4pPgX6HmFmO8zsTQ3a1zVm9plG7Kte7TqhpDGzM83sG+Hta83s7LTnuvvfuPt/q/P9FoTHYL969iPFpkAv0lgnAyOx2z9tY1lEAAX6nmBmXweOBr5nZi+Y2cfC7aeY2f81swkzu8vMTg23zzOzx83sbeH9l5jZg2b2XjO7GHgX8LFwX9/L8f4HmNnfmtmjZvakmX3ZzPrDx04N3+sjZvaUme00s/fFXvsyM/uemT1vZj8xs8+Y2b+Hj/0ofNpdYVnOj70ucX9l5TrfzEbKtl1qZhvD22ea2c/M7FdmNmZmH81xuIeBLWZ2EDDP3VPTSGa2ysyuC29HNfOLwuP0SzP7ROy5rzWzkfA4PGlmfxc+FB2DifAY/L6ZHWtmm8zs6XA/15vZQGxfO8zso2Z2t5k9Z2brzOzA2OPnmNlo+F4Pmdlbw+2HmNlXw2M6Fn4XfeFjrzCzfwv390szW5fjWEmruLt+euAH2AG8KXZ/CHgaOJPghP/m8P5g+PhbgCeA3wGuAr4de+01wGcqvJ8DrwhvXwlsBOYBBwPfAz4bPnYqsBu4HCiF5dkFHBo+/s3wZy7wKuAx4N+T3ifP/srKOBf4FXBcbNtPgAvC2zuB14e3DwVOyvi824EJYA/wHPACMBVu+0rKa1YB14W3F4Sf5SqgHzgReBH4T+Hj/wG8J7z9EuCUstftF9vvK8Lv8wBgkOBk8IWyv4U7gfnhd3If8KHwsdeG5X9z+HcxBBwfPvYd4CvAQeHfxZ3AB8PHbgA+Eb7mQOB17f6b10/sb63dBdBPi77o2YH+48DXy55zC3BR7P7/BLYBY8DLYtuvIWegBwz4NXBs7LHfBx4Ob58KTJYFqqeAU4C+MFguij32GSoH+sT9pZTzOuCvwtvHEQT+ueH9R4EPAi/NeYzfBHwnvL0WeGeF569idqA/Mvb4new76fwI+GvgsLJ9zAr0Ce+zHNha9rfw7tj9zwNfDm9/BbgyYR+HE5x4+mPbLgRuD29fG37mI9PKoZ/2/Sh107uOAd4Zpm0mzGwCeB1wROw5a4FXA9e4+9M1vs8gQc15S+x9/jncHnna3XfH7u8iqLUOAvsR1OIj8dtp0vaX5BsEAQvgj4EN7r4rvH8uwRXBI2Fa4veTdmBmnw8/183AW8LbHwCuMrMncpQ3Lv78eLk/ALwSuD9MYf1h2g7M7HAz+2aYXnme4GR2WM73OQp4KGG3xxBcIe2MfY9fIajZA3yM4KR+p5nda2bvr/RBpXUU6HtH+TSljxHU6AdiPwe5+2qAMPe6lqCm9t/LerZUM+XpLwlq2CfE3ucQd08LvHHjBGmYI2PbjqrivfO4DRg0s8UEAf8b0QPu/hN3P4cgmG0Abkzagbt/zN0HgIcJrmL+APiP8LP+biMK6e4PuPuFYVk+B3w7bAdI+i7+Jtz+Gnd/KfBugiCcx2PAsSnbXyS4ooi+x5e6+wlh+Z5w9z919/kEV0H/yzqoN1SvU6DvHU8CL4/dvw54m5ktM7M+MzswbBiNgupfEgSL9wNrgGujhreEfaVy970Eeecrzex3AMxsyMyW5XjtHuAmYJWZzTWz44H3VvhcVXH3KeBbBJ9xHkHgx8z2N7N3mdkh4XOeB/
am7cfMDgYOdvedwEns63nTEGb2bjMbDI/nRLh5L8HJcC8zj8HBBG0Ez5nZELCiirf6KvA+M3ujmc0Jv6vjw891K3CFmb00fOxYM/uDsHzvjP3tPEvwt5N6vKS1FOh7x2eBT4aX3R9198eAcwgC+jhBjW0FMMfMTgb+AnhvGGw/R/CPuzLc11eBV4X72pDjvT8OPAhsDlMJ/wIsylnuDwOHEKQavk7Q6Pdi7PFVwNfCspyXc5/lvkGQX/9WWcrnPcCOsMwfIuhtlGYJMBrePgnYUmNZ0rwVuNfMXgD+niB3Pxmmmf4HcEd4DE4hyOWfRNCoejPByTIXd78TeB9BA/pzwL8RpG0gOMnuD/yMIJh/m32pvv8M/Dgs30bgz93953V8Xmkgc9fCI9I9zOxzwO+6+0XtLotIt1CNXjqamR1vZr9ngdcSNEp+p93lEukmGjYtne5ggnTNfIJ8/BXAd9taIpEuo9SNiEjBKXUjIlJwHZG6Oeyww3zBggXtLoaISFfZsmXLL919sNLzOiLQL1iwgJGRhnY7FhEpPDN7JM/zlLoRESk4BXoRkYJToBcRKTgFehGRglOgFxEpuI7odSMi2TZsHWPNLdv5xcQk8wf6WbFsEcuXDLW7WNIlFOhFOtyGrWNcdtM2Jqf2ADA2McllN20DULCXXCqmbszsagsWWb4ntm1duHjwaLjQ8Gi4fYGZTcYe+3IzCy/SC9bcsn06yEcmp/aw5pbtbSqRdJs8NfprgC8SrDQEgLufH902sysI5q2OPOTuixtVQJFe94uJyaq2i5SrWKN39x8BzyQ9ZmYGnEcwu6CINMH8gf6qtouUq7fXzeuBJ939gdi2hWa2NVxM+fV17l+k561Ytoj+Ut+Mbf2lPlYsy7tIl/S6ehtjL2RmbX4ncLS7Px0uR7fBzE5w9+fLX2hmFwMXAxx99NF1FkOkuKIGV/W6kVrlmo/ezBYA33f3V8e27QeMASe7++Mpr/tX4KPunjlj2fDwsGtSMxGR6pjZFncfrvS8elI3bwLujwd5Mxs0s77w9suB4wAtECwi0kZ5ulfeAPwHsMjMHjezD4QPXcDsRtg3AHeH3S2/DXzI3RMbckVEpDUq5ujd/cKU7X+SsG09sL7+YomISKNorhsRkYJToBcRKTgFehGRglOgFxEpOAV6EZGCU6AXESk4BXoRkYJToBcRKTgFehGRglOgFxEpOAV6EZGCU6AXESk4BXoRkYJToBcRKTgFehGRglOgFxEpOAV6EZGCU6AXESk4BXoRkYLLszj41Wb2lJndE9u2yszGzGw0/Dkz9thlZvagmW03s2XNKriIiORTcXFw4Brgi8C1ZduvdPe/jW8ws1cBFwAnAPOBfzGzV7r7ngaUVaRlNmwdY80t2/nFxCTzB/pZsWwRy5cMteV9WlUWKa6KNXp3/xHwTM79nQN8091fdPeHgQeB19ZRPpGW27B1jMtu2sbYxCQOjE1Mcsm6UZZcfisbto419X0uu2nbjPfI8xyRSurJ0X/YzO4OUzuHhtuGgMdiz3k83CbSNdbcsp3JqdkXoc/umuKym7bxyQ3bWLp6EwtX3szS1ZtqDrpJ7zM5tYc1t2yv6jkildQa6L8EHAssBnYCV1S7AzO72MxGzGxkfHy8xmKINN4vJiZTH5uc2sP1mx9tSA077X3i2/M8R6SSmgK9uz/p7nvcfS9wFfvSM2PAUbGnHhluS9rHWncfdvfhwcHBWooh0hTzB/ozH/ey+7XWsNPeJ749z3NEKqkp0JvZEbG7bweiHjkbgQvM7AAzWwgcB9xZXxFFWmvFskX0l/qqek0tNeyk9+kv9bFi2aKqnrNh61hDUklSXBV73ZjZDcCpwGFm9jjwKeBUM1tMULnZAXwQwN3vNbMbgZ8Bu4E/U48b6TZRj5ZVG+9lYnJqxmPG7Bo97KthV9NDJtqe9fxKz4kaa6M8fpRKir9WxNyT/mxba3h42EdGRtpdDJFZygP3accPsn7L2IwG0v
5SH599x2sAZgTd+GN5gm4t3SiXrt7EWMLVxNBAP3esPD3vx5QuZWZb3H240vPy9KMX6VnLlwzNCrbDx8xLDMhLV29
|
|||
|
"text/plain": [
|
|||
|
"<matplotlib.figure.Figure at 0x7f2420673710>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
"# How many sentences share each text length; labelled so the figure reads on its own.\n",
"plt.title(\"text length vs # instances\")\n",
"plt.xlabel(\"text length (characters)\")\n",
"plt.ylabel(\"number of sentences\")\n",
"plt.scatter(list(text_len_counter.keys()), list(text_len_counter.values()))"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"### Check word frequencies"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 17,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
"# One row per distinct word (the index) with its count in column 0, most frequent first.\n",
"# Built in a single expression instead of an in-place sort, so re-running the cell is idempotent.\n",
"w_count_df = pd.DataFrame.from_dict(w_count, orient='index').sort_values(0, ascending=False)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 18,
|
|||
|
"metadata": {
|
|||
|
"scrolled": true
|
|||
|
},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>0</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>the</th>\n",
|
|||
|
" <td>18299</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>of</th>\n",
|
|||
|
" <td>8709</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>and</th>\n",
|
|||
|
" <td>6402</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>to</th>\n",
|
|||
|
" <td>6282</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>in</th>\n",
|
|||
|
" <td>4778</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>a</th>\n",
|
|||
|
" <td>4279</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>was</th>\n",
|
|||
|
" <td>3731</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>that</th>\n",
|
|||
|
" <td>2888</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>he</th>\n",
|
|||
|
" <td>2711</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>his</th>\n",
|
|||
|
" <td>2023</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>for</th>\n",
|
|||
|
" <td>1779</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>on</th>\n",
|
|||
|
" <td>1768</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>had</th>\n",
|
|||
|
" <td>1628</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>as</th>\n",
|
|||
|
" <td>1589</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>with</th>\n",
|
|||
|
" <td>1524</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>by</th>\n",
|
|||
|
" <td>1519</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>at</th>\n",
|
|||
|
" <td>1463</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>were</th>\n",
|
|||
|
" <td>1435</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>it</th>\n",
|
|||
|
" <td>1362</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>which</th>\n",
|
|||
|
" <td>1305</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>be</th>\n",
|
|||
|
" <td>1135</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>from</th>\n",
|
|||
|
" <td>1024</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>not</th>\n",
|
|||
|
" <td>1014</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>this</th>\n",
|
|||
|
" <td>992</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>is</th>\n",
|
|||
|
" <td>937</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>or</th>\n",
|
|||
|
" <td>932</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>but</th>\n",
|
|||
|
" <td>874</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>one</th>\n",
|
|||
|
" <td>782</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>have</th>\n",
|
|||
|
" <td>780</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>oswald</th>\n",
|
|||
|
" <td>776</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>...</th>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>eighteen:</th>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>lading</th>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>sustain</th>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>inflict,</th>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>markets,</th>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>blow.</th>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>ill-health</th>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>delirium</th>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>tremens,</th>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>dejection,</th>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>sacking</th>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>prize-fighter</th>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>scandalized</th>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>outshone</th>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>ferdinand</th>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>grain.</th>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>fluctuations</th>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>attempt\"</th>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>action;</th>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>grains,</th>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>prices,</th>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>protectionists</th>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>depress</th>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>market,</th>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>election;</th>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>napoleon</th>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>french,</th>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>popularity</th>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>\"convulsive</th>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>lessening</th>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"<p>22943 rows × 1 columns</p>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" 0\n",
|
|||
|
"the 18299\n",
|
|||
|
"of 8709\n",
|
|||
|
"and 6402\n",
|
|||
|
"to 6282\n",
|
|||
|
"in 4778\n",
|
|||
|
"a 4279\n",
|
|||
|
"was 3731\n",
|
|||
|
"that 2888\n",
|
|||
|
"he 2711\n",
|
|||
|
"his 2023\n",
|
|||
|
"for 1779\n",
|
|||
|
"on 1768\n",
|
|||
|
"had 1628\n",
|
|||
|
"as 1589\n",
|
|||
|
"with 1524\n",
|
|||
|
"by 1519\n",
|
|||
|
"at 1463\n",
|
|||
|
"were 1435\n",
|
|||
|
"it 1362\n",
|
|||
|
"which 1305\n",
|
|||
|
"be 1135\n",
|
|||
|
"from 1024\n",
|
|||
|
"not 1014\n",
|
|||
|
"this 992\n",
|
|||
|
"is 937\n",
|
|||
|
"or 932\n",
|
|||
|
"but 874\n",
|
|||
|
"one 782\n",
|
|||
|
"have 780\n",
|
|||
|
"oswald 776\n",
|
|||
|
"... ...\n",
|
|||
|
"eighteen: 1\n",
|
|||
|
"lading 1\n",
|
|||
|
"sustain 1\n",
|
|||
|
"inflict, 1\n",
|
|||
|
"markets, 1\n",
|
|||
|
"blow. 1\n",
|
|||
|
"ill-health 1\n",
|
|||
|
"delirium 1\n",
|
|||
|
"tremens, 1\n",
|
|||
|
"dejection, 1\n",
|
|||
|
"sacking 1\n",
|
|||
|
"prize-fighter 1\n",
|
|||
|
"scandalized 1\n",
|
|||
|
"outshone 1\n",
|
|||
|
"ferdinand 1\n",
|
|||
|
"grain. 1\n",
|
|||
|
"fluctuations 1\n",
|
|||
|
"attempt\" 1\n",
|
|||
|
"action; 1\n",
|
|||
|
"grains, 1\n",
|
|||
|
"prices, 1\n",
|
|||
|
"protectionists 1\n",
|
|||
|
"depress 1\n",
|
|||
|
"market, 1\n",
|
|||
|
"election; 1\n",
|
|||
|
"napoleon 1\n",
|
|||
|
"french, 1\n",
|
|||
|
"popularity 1\n",
|
|||
|
"\"convulsive 1\n",
|
|||
|
"lessening 1\n",
|
|||
|
"\n",
|
|||
|
"[22943 rows x 1 columns]"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 18,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"w_count_df"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 25,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"11"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 25,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
"# occurrence count of one chosen word (row label = word, column 0 = count)\n",
"w_count_df.loc['minute', 0]"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": null,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
"# frequency bar plot, one bar per row of w_count_df - it takes time!!\n",
"w_count_df.plot(kind=\"bar\")"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"metadata": {
|
|||
|
"kernelspec": {
|
|||
|
"display_name": "Python 3",
|
|||
|
"language": "python",
|
|||
|
"name": "python3"
|
|||
|
},
|
|||
|
"language_info": {
|
|||
|
"codemirror_mode": {
|
|||
|
"name": "ipython",
|
|||
|
"version": 3
|
|||
|
},
|
|||
|
"file_extension": ".py",
|
|||
|
"mimetype": "text/x-python",
|
|||
|
"name": "python",
|
|||
|
"nbconvert_exporter": "python",
|
|||
|
"pygments_lexer": "ipython3",
|
|||
|
"version": "3.6.5"
|
|||
|
}
|
|||
|
},
|
|||
|
"nbformat": 4,
|
|||
|
"nbformat_minor": 2
|
|||
|
}
|