{ "cells": [ { "cell_type": "markdown", "source": [ "# Analyze data" ], "metadata": { "collapsed": false, "pycharm": { "name": "#%% md\n" } } }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import os\n", "\n", "import pandas as pd\n", "from tabulate import tabulate\n", "from tqdm import tqdm\n", "\n", "tqdm.pandas()\n" ] }, { "cell_type": "markdown", "source": [ "## Load raw data" ], "metadata": { "collapsed": false, "pycharm": { "name": "#%% md\n" } } }, { "cell_type": "code", "execution_count": 2, "outputs": [], "source": [ "TRAIN_DEV_PATH = os.path.join('../data/eam2021-train-set/bq-results-20210825-203004-swh711l21gv2.csv')\n", "TEST_PATH = os.path.join('../data/eam2021-test-set-public/eam2021-test-set-public.csv')\n", "assert os.path.isfile(TRAIN_DEV_PATH)\n", "assert os.path.isfile(TEST_PATH)" ], "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } } }, { "cell_type": "code", "execution_count": 3, "outputs": [], "source": [ "train_dev_df = pd.read_csv(TRAIN_DEV_PATH, encoding='utf-8')\n", "test_df = pd.read_csv(TEST_PATH, encoding='utf-8')" ], "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } } }, { "cell_type": "markdown", "source": [ "## Peek at the raw data frames" ], "metadata": { "collapsed": false, "pycharm": { "name": "#%% md\n" } } }, { "cell_type": "code", "execution_count": 4, "outputs": [ { "data": { "text/plain": " language post_index commentText \\\n0 Hindi 238566 शायद योगी जी है \n1 Hindi 7009 Tingri h to putri tu.. . \n2 Hindi 404648 Saale Tu kon sa pagal Nahi h . Teri comment pa... \n3 Hindi 5057 girl 😘😘 aaj ke baad msg ki to maar daluggi \n4 Hindi 107146 Free fire pubg ka baap ha kutta sala kamina \n\n report_count_comment report_count_post like_count_comment \\\n0 0 0 1 \n1 0 0 0 \n2 0 0 0 \n3 0 0 0 \n4 0 0 0 \n\n like_count_post label val \n0 1 0 0 \n1 0 0 0 \n2 0 1 0 \n3 0 0 0 \n4 0 1 0 ", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
languagepost_indexcommentTextreport_count_commentreport_count_postlike_count_commentlike_count_postlabelval
0Hindi238566शायद योगी जी है001100
1Hindi7009Tingri h to putri tu.. .000000
2Hindi404648Saale Tu kon sa pagal Nahi h . Teri comment pa...000010
3Hindi5057girl 😘😘 aaj ke baad msg ki to maar daluggi000000
4Hindi107146Free fire pubg ka baap ha kutta sala kamina000010
\n
" }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train_dev_df.head()" ], "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } } }, { "cell_type": "code", "execution_count": 5, "outputs": [ { "data": { "text/plain": "0 665042\nName: val, dtype: int64" }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train_dev_df['val'].value_counts()" ], "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } } }, { "cell_type": "code", "execution_count": 6, "outputs": [ { "data": { "text/plain": " Id language post_index commentText \\\n0 2 Bengali 182442 Bichna theke agun berochhe re tar modhhyeu ami... \n1 3 Hindi 406921 JYOTI💏 chut ka pani \n2 4 Hindi 233255 Kuth tik n h \n3 5 Telugu 219308 Ekkada ap valaki kallu guddi ah?😠sarigga chuda... \n4 6 Hindi 269812 pagal khi ke gadhe me dha aata h bachho se kuc... \n\n report_count_comment report_count_post like_count_comment \\\n0 0 0 0 \n1 0 0 0 \n2 0 0 0 \n3 0 0 0 \n4 0 0 0 \n\n like_count_post \n0 0 \n1 0 \n2 0 \n3 0 \n4 0 ", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
Idlanguagepost_indexcommentTextreport_count_commentreport_count_postlike_count_commentlike_count_post
02Bengali182442Bichna theke agun berochhe re tar modhhyeu ami...0000
13Hindi406921JYOTI💏 chut ka pani0000
24Hindi233255Kuth tik n h0000
35Telugu219308Ekkada ap valaki kallu guddi ah?😠sarigga chuda...0000
46Hindi269812pagal khi ke gadhe me dha aata h bachho se kuc...0000
\n
" }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "test_df.head()" ], "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } } }, { "cell_type": "markdown", "source": [ "## Compute frequencies of labels and languages of raw data" ], "metadata": { "collapsed": false, "pycharm": { "name": "#%% md\n" } } }, { "cell_type": "code", "execution_count": 7, "outputs": [], "source": [ "def tabulify(df: pd.DataFrame, col_name: str) -> None:\n", " \"\"\"Tabulate the frequencies with percentages and display the total\"\"\"\n", " freqs_dict = df[col_name].value_counts(normalize=False).to_dict()\n", " fracs_dict = df[col_name].value_counts(normalize=True).to_dict()\n", " names = [k for k, v in freqs_dict.items()]\n", " freqs = [freqs_dict[k] for k in names]\n", " fracs = [fracs_dict[k] for k in names]\n", " percentages = [f'{100.0 * frac:0.1f}%' for frac in fracs]\n", " print(tabulate(\n", " {col_name: names + ['---Total---'], 'frequency': freqs + [sum(freqs_dict.values())],\n", " 'percentage': percentages + ['100.0%']},\n", " headers='keys',\n", " ))" ], "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } } }, { "cell_type": "code", "execution_count": 8, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "*** TRAIN DEV LABEL FREQUENCIES ***\n", "\n", "╒═════════════╤═════════════╤══════════════╕\n", "│ label │ frequency │ percentage │\n", "╞═════════════╪═════════════╪══════════════╡\n", "│ 0 │ 352386 │ 53.0% │\n", "├─────────────┼─────────────┼──────────────┤\n", "│ 1 │ 312656 │ 47.0% │\n", "├─────────────┼─────────────┼──────────────┤\n", "│ ---Total--- │ 665042 │ 100.0% │\n", "╘═════════════╧═════════════╧══════════════╛\n" ] } ], "source": [ "print(f'\\n\\n*** TRAIN DEV LABEL FREQUENCIES ***\\n')\n", "tabulify(df=train_dev_df, col_name='label')" ], "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } } }, { "cell_type": "code", "execution_count": 9, "outputs": [ { 
"name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "*** TRAIN DEV LANGUAGE FREQUENCIES ***\n", "\n", "╒═════════════╤═════════════╤══════════════╕\n", "│ language │ frequency │ percentage │\n", "╞═════════════╪═════════════╪══════════════╡\n", "│ Hindi │ 307180 │ 46.2% │\n", "├─────────────┼─────────────┼──────────────┤\n", "│ Telugu │ 97012 │ 14.6% │\n", "├─────────────┼─────────────┼──────────────┤\n", "│ Marathi │ 72044 │ 10.8% │\n", "├─────────────┼─────────────┼──────────────┤\n", "│ Tamil │ 69497 │ 10.5% │\n", "├─────────────┼─────────────┼──────────────┤\n", "│ Malayalam │ 40965 │ 6.2% │\n", "├─────────────┼─────────────┼──────────────┤\n", "│ Bengali │ 22835 │ 3.4% │\n", "├─────────────┼─────────────┼──────────────┤\n", "│ Kannada │ 13943 │ 2.1% │\n", "├─────────────┼─────────────┼──────────────┤\n", "│ Odia │ 10974 │ 1.7% │\n", "├─────────────┼─────────────┼──────────────┤\n", "│ Gujarati │ 8828 │ 1.3% │\n", "├─────────────┼─────────────┼──────────────┤\n", "│ Haryanvi │ 8812 │ 1.3% │\n", "├─────────────┼─────────────┼──────────────┤\n", "│ Bhojpuri │ 5804 │ 0.9% │\n", "├─────────────┼─────────────┼──────────────┤\n", "│ Rajasthani │ 4368 │ 0.7% │\n", "├─────────────┼─────────────┼──────────────┤\n", "│ Assamese │ 2780 │ 0.4% │\n", "├─────────────┼─────────────┼──────────────┤\n", "│ ---Total--- │ 665042 │ 100.0% │\n", "╘═════════════╧═════════════╧══════════════╛\n" ] } ], "source": [ "print(f'\\n\\n*** TRAIN DEV LANGUAGE FREQUENCIES ***\\n')\n", "tabulify(df=train_dev_df, col_name='language')" ], "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } } }, { "cell_type": "code", "execution_count": 10, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 665042/665042 [00:09<00:00, 68177.01it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "*** TRAIN DEV LANGUAGE-LABEL PAIR FREQUENCIES ***\n", "\n", "╒══════════════════════╤═════════════╤══════════════╕\n", "│ 
language_and_label │ frequency │ percentage │\n", "╞══════════════════════╪═════════════╪══════════════╡\n", "│ ('Hindi', 1) │ 153747 │ 23.1% │\n", "├──────────────────────┼─────────────┼──────────────┤\n", "│ ('Hindi', 0) │ 153433 │ 23.1% │\n", "├──────────────────────┼─────────────┼──────────────┤\n", "│ ('Telugu', 1) │ 48551 │ 7.3% │\n", "├──────────────────────┼─────────────┼──────────────┤\n", "│ ('Telugu', 0) │ 48461 │ 7.3% │\n", "├──────────────────────┼─────────────┼──────────────┤\n", "│ ('Marathi', 0) │ 44677 │ 6.7% │\n", "├──────────────────────┼─────────────┼──────────────┤\n", "│ ('Tamil', 0) │ 34792 │ 5.2% │\n", "├──────────────────────┼─────────────┼──────────────┤\n", "│ ('Tamil', 1) │ 34705 │ 5.2% │\n", "├──────────────────────┼─────────────┼──────────────┤\n", "│ ('Malayalam', 0) │ 31749 │ 4.8% │\n", "├──────────────────────┼─────────────┼──────────────┤\n", "│ ('Marathi', 1) │ 27367 │ 4.1% │\n", "├──────────────────────┼─────────────┼──────────────┤\n", "│ ('Bengali', 0) │ 11428 │ 1.7% │\n", "├──────────────────────┼─────────────┼──────────────┤\n", "│ ('Bengali', 1) │ 11407 │ 1.7% │\n", "├──────────────────────┼─────────────┼──────────────┤\n", "│ ('Malayalam', 1) │ 9216 │ 1.4% │\n", "├──────────────────────┼─────────────┼──────────────┤\n", "│ ('Kannada', 1) │ 6989 │ 1.1% │\n", "├──────────────────────┼─────────────┼──────────────┤\n", "│ ('Kannada', 0) │ 6954 │ 1.0% │\n", "├──────────────────────┼─────────────┼──────────────┤\n", "│ ('Odia', 1) │ 5499 │ 0.8% │\n", "├──────────────────────┼─────────────┼──────────────┤\n", "│ ('Odia', 0) │ 5475 │ 0.8% │\n", "├──────────────────────┼─────────────┼──────────────┤\n", "│ ('Gujarati', 0) │ 4426 │ 0.7% │\n", "├──────────────────────┼─────────────┼──────────────┤\n", "│ ('Haryanvi', 1) │ 4417 │ 0.7% │\n", "├──────────────────────┼─────────────┼──────────────┤\n", "│ ('Gujarati', 1) │ 4402 │ 0.7% │\n", "├──────────────────────┼─────────────┼──────────────┤\n", "│ ('Haryanvi', 0) │ 4395 │ 0.7% │\n", 
"├──────────────────────┼─────────────┼──────────────┤\n", "│ ('Bhojpuri', 0) │ 2917 │ 0.4% │\n", "├──────────────────────┼─────────────┼──────────────┤\n", "│ ('Bhojpuri', 1) │ 2887 │ 0.4% │\n", "├──────────────────────┼─────────────┼──────────────┤\n", "│ ('Rajasthani', 1) │ 2185 │ 0.3% │\n", "├──────────────────────┼─────────────┼──────────────┤\n", "│ ('Rajasthani', 0) │ 2183 │ 0.3% │\n", "├──────────────────────┼─────────────┼──────────────┤\n", "│ ('Assamese', 0) │ 1496 │ 0.2% │\n", "├──────────────────────┼─────────────┼──────────────┤\n", "│ ('Assamese', 1) │ 1284 │ 0.2% │\n", "├──────────────────────┼─────────────┼──────────────┤\n", "│ ---Total--- │ 665042 │ 100.0% │\n", "╘══════════════════════╧═════════════╧══════════════╛\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n" ] } ], "source": [ "train_dev_df['language_and_label'] = train_dev_df.progress_apply(lambda x: (x['language'], x['label']), axis=1)\n", "print(f'\\n\\n*** TRAIN DEV LANGUAGE-LABEL PAIR FREQUENCIES ***\\n')\n", "tabulify(df=train_dev_df, col_name='language_and_label')" ], "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } } }, { "cell_type": "code", "execution_count": 11, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "*** TEST LANGUAGE FREQUENCIES ***\n", "\n", "╒═════════════╤═════════════╤══════════════╕\n", "│ language │ frequency │ percentage │\n", "╞═════════════╪═════════════╪══════════════╡\n", "│ Hindi │ 34313 │ 46.2% │\n", "├─────────────┼─────────────┼──────────────┤\n", "│ Telugu │ 10877 │ 14.6% │\n", "├─────────────┼─────────────┼──────────────┤\n", "│ Marathi │ 8057 │ 10.9% │\n", "├─────────────┼─────────────┼──────────────┤\n", "│ Tamil │ 7864 │ 10.6% │\n", "├─────────────┼─────────────┼──────────────┤\n", "│ Malayalam │ 4465 │ 6.0% │\n", "├─────────────┼─────────────┼──────────────┤\n", "│ Bengali │ 2534 │ 3.4% │\n", "├─────────────┼─────────────┼──────────────┤\n", "│ Kannada │ 1592 │ 2.1% │\n", 
"├─────────────┼─────────────┼──────────────┤\n", "│ Odia │ 1131 │ 1.5% │\n", "├─────────────┼─────────────┼──────────────┤\n", "│ Haryanvi │ 1025 │ 1.4% │\n", "├─────────────┼─────────────┼──────────────┤\n", "│ Gujarati │ 949 │ 1.3% │\n", "├─────────────┼─────────────┼──────────────┤\n", "│ Bhojpuri │ 663 │ 0.9% │\n", "├─────────────┼─────────────┼──────────────┤\n", "│ Rajasthani │ 475 │ 0.6% │\n", "├─────────────┼─────────────┼──────────────┤\n", "│ Assamese │ 308 │ 0.4% │\n", "├─────────────┼─────────────┼──────────────┤\n", "│ ---Total--- │ 74253 │ 100.0% │\n", "╘═════════════╧═════════════╧══════════════╛\n" ] } ], "source": [ "print(f'\\n\\n*** TEST LANGUAGE FREQUENCIES ***\\n')\n", "tabulify(df=test_df, col_name='language')" ], "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } } }, { "cell_type": "markdown", "source": [ "## Prepared data" ], "metadata": { "collapsed": false, "pycharm": { "name": "#%% md\n" } } }, { "cell_type": "markdown", "source": [ "### Load prepared data" ], "metadata": { "collapsed": false, "pycharm": { "name": "#%% md\n" } } }, { "cell_type": "code", "execution_count": 12, "outputs": [], "source": [ "PREPARED_TRAIN_PATH = os.path.join('../data/prepared/train.csv')\n", "PREPARED_DEV_PATH = os.path.join('../data/prepared/dev.csv')\n", "PREPARED_TEST_PATH = os.path.join('../data/prepared/test.csv')\n", "assert os.path.isfile(PREPARED_TRAIN_PATH)\n", "assert os.path.isfile(PREPARED_DEV_PATH)\n", "assert os.path.isfile(PREPARED_TEST_PATH)" ], "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } } }, { "cell_type": "code", "execution_count": 13, "outputs": [], "source": [ "prepared_train_df = pd.read_csv(PREPARED_TRAIN_PATH, encoding='utf-8')\n", "prepared_dev_df = pd.read_csv(PREPARED_DEV_PATH, encoding='utf-8')\n", "prepared_test_df = pd.read_csv(PREPARED_TEST_PATH, encoding='utf-8')\n", "assert test_df.equals(prepared_test_df)" ], "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } } }, { 
"cell_type": "markdown", "source": [ "### Compute frequencies of labels and languages of prepared data" ], "metadata": { "collapsed": false, "pycharm": { "name": "#%% md\n" } } }, { "cell_type": "code", "execution_count": 14, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "*** TRAIN LABEL FREQUENCIES ***\n", "\n", "╒═════════════╤═════════════╤══════════════╕\n", "│ label │ frequency │ percentage │\n", "╞═════════════╪═════════════╪══════════════╡\n", "│ 0 │ 334765 │ 53.0% │\n", "├─────────────┼─────────────┼──────────────┤\n", "│ 1 │ 297024 │ 47.0% │\n", "├─────────────┼─────────────┼──────────────┤\n", "│ ---Total--- │ 631789 │ 100.0% │\n", "╘═════════════╧═════════════╧══════════════╛\n", "\n", "\n", "*** DEV LABEL FREQUENCIES ***\n", "\n", "╒═════════════╤═════════════╤══════════════╕\n", "│ label │ frequency │ percentage │\n", "╞═════════════╪═════════════╪══════════════╡\n", "│ 0 │ 17621 │ 53.0% │\n", "├─────────────┼─────────────┼──────────────┤\n", "│ 1 │ 15632 │ 47.0% │\n", "├─────────────┼─────────────┼──────────────┤\n", "│ ---Total--- │ 33253 │ 100.0% │\n", "╘═════════════╧═════════════╧══════════════╛\n" ] } ], "source": [ "\n", "print(f'\\n\\n*** TRAIN LABEL FREQUENCIES ***\\n')\n", "tabulify(df=prepared_train_df, col_name='label')\n", "\n", "print(f'\\n\\n*** DEV LABEL FREQUENCIES ***\\n')\n", "tabulify(df=prepared_dev_df, col_name='label')" ], "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } } }, { "cell_type": "code", "execution_count": 15, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "*** TRAIN LANGUAGE FREQUENCIES ***\n", "\n", "╒═════════════╤═════════════╤══════════════╕\n", "│ language │ frequency │ percentage │\n", "╞═════════════╪═════════════╪══════════════╡\n", "│ Hindi │ 291820 │ 46.2% │\n", "├─────────────┼─────────────┼──────────────┤\n", "│ Telugu │ 92161 │ 14.6% │\n", "├─────────────┼─────────────┼──────────────┤\n", "│ Marathi │ 68442 │ 10.8% │\n", 
"├─────────────┼─────────────┼──────────────┤\n", "│ Tamil │ 66022 │ 10.5% │\n", "├─────────────┼─────────────┼──────────────┤\n", "│ Malayalam │ 38916 │ 6.2% │\n", "├─────────────┼─────────────┼──────────────┤\n", "│ Bengali │ 21694 │ 3.4% │\n", "├─────────────┼─────────────┼──────────────┤\n", "│ Kannada │ 13246 │ 2.1% │\n", "├─────────────┼─────────────┼──────────────┤\n", "│ Odia │ 10425 │ 1.7% │\n", "├─────────────┼─────────────┼──────────────┤\n", "│ Gujarati │ 8387 │ 1.3% │\n", "├─────────────┼─────────────┼──────────────┤\n", "│ Haryanvi │ 8371 │ 1.3% │\n", "├─────────────┼─────────────┼──────────────┤\n", "│ Bhojpuri │ 5514 │ 0.9% │\n", "├─────────────┼─────────────┼──────────────┤\n", "│ Rajasthani │ 4150 │ 0.7% │\n", "├─────────────┼─────────────┼──────────────┤\n", "│ Assamese │ 2641 │ 0.4% │\n", "├─────────────┼─────────────┼──────────────┤\n", "│ ---Total--- │ 631789 │ 100.0% │\n", "╘═════════════╧═════════════╧══════════════╛\n", "\n", "\n", "*** DEV LANGUAGE FREQUENCIES ***\n", "\n", "╒═════════════╤═════════════╤══════════════╕\n", "│ language │ frequency │ percentage │\n", "╞═════════════╪═════════════╪══════════════╡\n", "│ Hindi │ 15360 │ 46.2% │\n", "├─────────────┼─────────────┼──────────────┤\n", "│ Telugu │ 4851 │ 14.6% │\n", "├─────────────┼─────────────┼──────────────┤\n", "│ Marathi │ 3602 │ 10.8% │\n", "├─────────────┼─────────────┼──────────────┤\n", "│ Tamil │ 3475 │ 10.5% │\n", "├─────────────┼─────────────┼──────────────┤\n", "│ Malayalam │ 2049 │ 6.2% │\n", "├─────────────┼─────────────┼──────────────┤\n", "│ Bengali │ 1141 │ 3.4% │\n", "├─────────────┼─────────────┼──────────────┤\n", "│ Kannada │ 697 │ 2.1% │\n", "├─────────────┼─────────────┼──────────────┤\n", "│ Odia │ 549 │ 1.7% │\n", "├─────────────┼─────────────┼──────────────┤\n", "│ Gujarati │ 441 │ 1.3% │\n", "├─────────────┼─────────────┼──────────────┤\n", "│ Haryanvi │ 441 │ 1.3% │\n", "├─────────────┼─────────────┼──────────────┤\n", "│ Bhojpuri │ 290 │ 0.9% │\n", 
"├─────────────┼─────────────┼──────────────┤\n", "│ Rajasthani │ 218 │ 0.7% │\n", "├─────────────┼─────────────┼──────────────┤\n", "│ Assamese │ 139 │ 0.4% │\n", "├─────────────┼─────────────┼──────────────┤\n", "│ ---Total--- │ 33253 │ 100.0% │\n", "╘═════════════╧═════════════╧══════════════╛\n" ] } ], "source": [ "print(f'\\n\\n*** TRAIN LANGUAGE FREQUENCIES ***\\n')\n", "tabulify(df=prepared_train_df, col_name='language')\n", "\n", "print(f'\\n\\n*** DEV LANGUAGE FREQUENCIES ***\\n')\n", "tabulify(df=prepared_dev_df, col_name='language')" ], "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } } }, { "cell_type": "code", "execution_count": 16, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "*** TRAIN LANGUAGE-LABEL PAIR FREQUENCIES ***\n", "\n", "╒══════════════════════╤═════════════╤══════════════╕\n", "│ language_and_label │ frequency │ percentage │\n", "╞══════════════════════╪═════════════╪══════════════╡\n", "│ ('Hindi', 1) │ 146059 │ 23.1% │\n", "├──────────────────────┼─────────────┼──────────────┤\n", "│ ('Hindi', 0) │ 145761 │ 23.1% │\n", "├──────────────────────┼─────────────┼──────────────┤\n", "│ ('Telugu', 1) │ 46123 │ 7.3% │\n", "├──────────────────────┼─────────────┼──────────────┤\n", "│ ('Telugu', 0) │ 46038 │ 7.3% │\n", "├──────────────────────┼─────────────┼──────────────┤\n", "│ ('Marathi', 0) │ 42443 │ 6.7% │\n", "├──────────────────────┼─────────────┼──────────────┤\n", "│ ('Tamil', 0) │ 33052 │ 5.2% │\n", "├──────────────────────┼─────────────┼──────────────┤\n", "│ ('Tamil', 1) │ 32970 │ 5.2% │\n", "├──────────────────────┼─────────────┼──────────────┤\n", "│ ('Malayalam', 0) │ 30161 │ 4.8% │\n", "├──────────────────────┼─────────────┼──────────────┤\n", "│ ('Marathi', 1) │ 25999 │ 4.1% │\n", "├──────────────────────┼─────────────┼──────────────┤\n", "│ ('Bengali', 0) │ 10857 │ 1.7% │\n", "├──────────────────────┼─────────────┼──────────────┤\n", "│ ('Bengali', 1) │ 10837 │ 1.7% │\n", 
"├──────────────────────┼─────────────┼──────────────┤\n", "│ ('Malayalam', 1) │ 8755 │ 1.4% │\n", "├──────────────────────┼─────────────┼──────────────┤\n", "│ ('Kannada', 1) │ 6640 │ 1.1% │\n", "├──────────────────────┼─────────────┼──────────────┤\n", "│ ('Kannada', 0) │ 6606 │ 1.0% │\n", "├──────────────────────┼─────────────┼──────────────┤\n", "│ ('Odia', 1) │ 5224 │ 0.8% │\n", "├──────────────────────┼─────────────┼──────────────┤\n", "│ ('Odia', 0) │ 5201 │ 0.8% │\n", "├──────────────────────┼─────────────┼──────────────┤\n", "│ ('Gujarati', 0) │ 4205 │ 0.7% │\n", "├──────────────────────┼─────────────┼──────────────┤\n", "│ ('Haryanvi', 1) │ 4196 │ 0.7% │\n", "├──────────────────────┼─────────────┼──────────────┤\n", "│ ('Gujarati', 1) │ 4182 │ 0.7% │\n", "├──────────────────────┼─────────────┼──────────────┤\n", "│ ('Haryanvi', 0) │ 4175 │ 0.7% │\n", "├──────────────────────┼─────────────┼──────────────┤\n", "│ ('Bhojpuri', 0) │ 2771 │ 0.4% │\n", "├──────────────────────┼─────────────┼──────────────┤\n", "│ ('Bhojpuri', 1) │ 2743 │ 0.4% │\n", "├──────────────────────┼─────────────┼──────────────┤\n", "│ ('Rajasthani', 1) │ 2076 │ 0.3% │\n", "├──────────────────────┼─────────────┼──────────────┤\n", "│ ('Rajasthani', 0) │ 2074 │ 0.3% │\n", "├──────────────────────┼─────────────┼──────────────┤\n", "│ ('Assamese', 0) │ 1421 │ 0.2% │\n", "├──────────────────────┼─────────────┼──────────────┤\n", "│ ('Assamese', 1) │ 1220 │ 0.2% │\n", "├──────────────────────┼─────────────┼──────────────┤\n", "│ ---Total--- │ 631789 │ 100.0% │\n", "╘══════════════════════╧═════════════╧══════════════╛\n", "\n", "\n", "*** DEV LANGUAGE-LABEL PAIR FREQUENCIES ***\n", "\n", "╒══════════════════════╤═════════════╤══════════════╕\n", "│ language_and_label │ frequency │ percentage │\n", "╞══════════════════════╪═════════════╪══════════════╡\n", "│ ('Hindi', 1) │ 7688 │ 23.1% │\n", "├──────────────────────┼─────────────┼──────────────┤\n", "│ ('Hindi', 0) │ 7672 │ 23.1% │\n", 
"├──────────────────────┼─────────────┼──────────────┤\n", "│ ('Telugu', 1) │ 2428 │ 7.3% │\n", "├──────────────────────┼─────────────┼──────────────┤\n", "│ ('Telugu', 0) │ 2423 │ 7.3% │\n", "├──────────────────────┼─────────────┼──────────────┤\n", "│ ('Marathi', 0) │ 2234 │ 6.7% │\n", "├──────────────────────┼─────────────┼──────────────┤\n", "│ ('Tamil', 0) │ 1740 │ 5.2% │\n", "├──────────────────────┼─────────────┼──────────────┤\n", "│ ('Tamil', 1) │ 1735 │ 5.2% │\n", "├──────────────────────┼─────────────┼──────────────┤\n", "│ ('Malayalam', 0) │ 1588 │ 4.8% │\n", "├──────────────────────┼─────────────┼──────────────┤\n", "│ ('Marathi', 1) │ 1368 │ 4.1% │\n", "├──────────────────────┼─────────────┼──────────────┤\n", "│ ('Bengali', 0) │ 571 │ 1.7% │\n", "├──────────────────────┼─────────────┼──────────────┤\n", "│ ('Bengali', 1) │ 570 │ 1.7% │\n", "├──────────────────────┼─────────────┼──────────────┤\n", "│ ('Malayalam', 1) │ 461 │ 1.4% │\n", "├──────────────────────┼─────────────┼──────────────┤\n", "│ ('Kannada', 1) │ 349 │ 1.0% │\n", "├──────────────────────┼─────────────┼──────────────┤\n", "│ ('Kannada', 0) │ 348 │ 1.0% │\n", "├──────────────────────┼─────────────┼──────────────┤\n", "│ ('Odia', 1) │ 275 │ 0.8% │\n", "├──────────────────────┼─────────────┼──────────────┤\n", "│ ('Odia', 0) │ 274 │ 0.8% │\n", "├──────────────────────┼─────────────┼──────────────┤\n", "│ ('Haryanvi', 1) │ 221 │ 0.7% │\n", "├──────────────────────┼─────────────┼──────────────┤\n", "│ ('Gujarati', 0) │ 221 │ 0.7% │\n", "├──────────────────────┼─────────────┼──────────────┤\n", "│ ('Haryanvi', 0) │ 220 │ 0.7% │\n", "├──────────────────────┼─────────────┼──────────────┤\n", "│ ('Gujarati', 1) │ 220 │ 0.7% │\n", "├──────────────────────┼─────────────┼──────────────┤\n", "│ ('Bhojpuri', 0) │ 146 │ 0.4% │\n", "├──────────────────────┼─────────────┼──────────────┤\n", "│ ('Bhojpuri', 1) │ 144 │ 0.4% │\n", "├──────────────────────┼─────────────┼──────────────┤\n", "│ 
# %%
# One loop instead of a copy-pasted stanza per split; the printed banners and
# tables are the same as before, train first and dev second.
# The `language_and_label` column is read straight from the prepared frames.
for split_name, split_df in (('TRAIN', prepared_train_df), ('DEV', prepared_dev_df)):
    print(f'\n\n*** {split_name} LANGUAGE-LABEL PAIR FREQUENCIES ***\n')
    tabulify(df=split_df, col_name='language_and_label')