{
"cells": [
{
"cell_type": "markdown",
"source": [
"# Analyze data"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
}
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import os\n",
"\n",
"import pandas as pd\n",
"from tabulate import tabulate\n",
"from tqdm import tqdm\n",
"\n",
"tqdm.pandas()\n"
]
},
{
"cell_type": "markdown",
"source": [
"## Load raw data"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
}
},
{
"cell_type": "code",
"execution_count": 2,
"outputs": [],
"source": [
"# Locations of the raw competition CSVs, relative to this notebook's directory.\n",
"# The paths are assembled from their components so os.path.join actually does the joining.\n",
"TRAIN_DEV_PATH = os.path.join('..', 'data', 'eam2021-train-set', 'bq-results-20210825-203004-swh711l21gv2.csv')\n",
"TEST_PATH = os.path.join('..', 'data', 'eam2021-test-set-public', 'eam2021-test-set-public.csv')\n",
"# Fail early, naming the missing file, instead of erroring later inside read_csv.\n",
"assert os.path.isfile(TRAIN_DEV_PATH), f'Raw train/dev CSV not found: {TRAIN_DEV_PATH}'\n",
"assert os.path.isfile(TEST_PATH), f'Raw test CSV not found: {TEST_PATH}'"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 3,
"outputs": [],
"source": [
"# Read both raw CSVs as UTF-8: the labelled train/dev set and the public test set.\n",
"train_dev_df, test_df = (\n",
"    pd.read_csv(csv_path, encoding='utf-8')\n",
"    for csv_path in (TRAIN_DEV_PATH, TEST_PATH)\n",
")"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "markdown",
"source": [
"## Peek at the raw data frames"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
}
},
{
"cell_type": "code",
"execution_count": 4,
"outputs": [
{
"data": {
"text/plain": " language post_index commentText \\\n0 Hindi 238566 शायद योगी जी है \n1 Hindi 7009 Tingri h to putri tu.. . \n2 Hindi 404648 Saale Tu kon sa pagal Nahi h . Teri comment pa... \n3 Hindi 5057 girl 😘😘 aaj ke baad msg ki to maar daluggi \n4 Hindi 107146 Free fire pubg ka baap ha kutta sala kamina \n\n report_count_comment report_count_post like_count_comment \\\n0 0 0 1 \n1 0 0 0 \n2 0 0 0 \n3 0 0 0 \n4 0 0 0 \n\n like_count_post label val \n0 1 0 0 \n1 0 0 0 \n2 0 1 0 \n3 0 0 0 \n4 0 1 0 ",
"text/html": "
\n\n
\n \n \n | \n language | \n post_index | \n commentText | \n report_count_comment | \n report_count_post | \n like_count_comment | \n like_count_post | \n label | \n val | \n
\n \n \n \n 0 | \n Hindi | \n 238566 | \n शायद योगी जी है | \n 0 | \n 0 | \n 1 | \n 1 | \n 0 | \n 0 | \n
\n \n 1 | \n Hindi | \n 7009 | \n Tingri h to putri tu.. . | \n 0 | \n 0 | \n 0 | \n 0 | \n 0 | \n 0 | \n
\n \n 2 | \n Hindi | \n 404648 | \n Saale Tu kon sa pagal Nahi h . Teri comment pa... | \n 0 | \n 0 | \n 0 | \n 0 | \n 1 | \n 0 | \n
\n \n 3 | \n Hindi | \n 5057 | \n girl 😘😘 aaj ke baad msg ki to maar daluggi | \n 0 | \n 0 | \n 0 | \n 0 | \n 0 | \n 0 | \n
\n \n 4 | \n Hindi | \n 107146 | \n Free fire pubg ka baap ha kutta sala kamina | \n 0 | \n 0 | \n 0 | \n 0 | \n 1 | \n 0 | \n
\n \n
\n
"
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_dev_df.head()"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 5,
"outputs": [
{
"data": {
"text/plain": "0 665042\nName: val, dtype: int64"
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_dev_df['val'].value_counts()"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 6,
"outputs": [
{
"data": {
"text/plain": " Id language post_index commentText \\\n0 2 Bengali 182442 Bichna theke agun berochhe re tar modhhyeu ami... \n1 3 Hindi 406921 JYOTI💏 chut ka pani \n2 4 Hindi 233255 Kuth tik n h \n3 5 Telugu 219308 Ekkada ap valaki kallu guddi ah?😠sarigga chuda... \n4 6 Hindi 269812 pagal khi ke gadhe me dha aata h bachho se kuc... \n\n report_count_comment report_count_post like_count_comment \\\n0 0 0 0 \n1 0 0 0 \n2 0 0 0 \n3 0 0 0 \n4 0 0 0 \n\n like_count_post \n0 0 \n1 0 \n2 0 \n3 0 \n4 0 ",
"text/html": "\n\n
\n \n \n | \n Id | \n language | \n post_index | \n commentText | \n report_count_comment | \n report_count_post | \n like_count_comment | \n like_count_post | \n
\n \n \n \n 0 | \n 2 | \n Bengali | \n 182442 | \n Bichna theke agun berochhe re tar modhhyeu ami... | \n 0 | \n 0 | \n 0 | \n 0 | \n
\n \n 1 | \n 3 | \n Hindi | \n 406921 | \n JYOTI💏 chut ka pani | \n 0 | \n 0 | \n 0 | \n 0 | \n
\n \n 2 | \n 4 | \n Hindi | \n 233255 | \n Kuth tik n h | \n 0 | \n 0 | \n 0 | \n 0 | \n
\n \n 3 | \n 5 | \n Telugu | \n 219308 | \n Ekkada ap valaki kallu guddi ah?😠sarigga chuda... | \n 0 | \n 0 | \n 0 | \n 0 | \n
\n \n 4 | \n 6 | \n Hindi | \n 269812 | \n pagal khi ke gadhe me dha aata h bachho se kuc... | \n 0 | \n 0 | \n 0 | \n 0 | \n
\n \n
\n
"
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"test_df.head()"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "markdown",
"source": [
"## Compute frequencies of labels and languages of raw data"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
}
},
{
"cell_type": "code",
"execution_count": 7,
"outputs": [],
"source": [
"def tabulify(df: pd.DataFrame, col_name: str, tablefmt: str = 'simple') -> None:\n",
"    \"\"\"Print a frequency table for one column, followed by a total row.\n",
"\n",
"    Each distinct value counted by ``value_counts`` gets one row holding its\n",
"    count and its share of the counted rows, in the order ``value_counts``\n",
"    returns them. A final ``---Total---`` row holds the sum of the counts.\n",
"\n",
"    Args:\n",
"        df: Data frame containing the column to summarise.\n",
"        col_name: Name of the column whose values are counted; it is also used\n",
"            as the header of the first table column.\n",
"        tablefmt: Table format name forwarded to ``tabulate`` (for example\n",
"            ``'fancy_grid'`` for a box-drawing layout). The default matches\n",
"            ``tabulate``'s own default, so existing calls print as before.\n",
"\n",
"    Returns:\n",
"        None. The table is written to stdout.\n",
"    \"\"\"\n",
"    # Count once and derive the shares from the counts, rather than running a\n",
"    # second, normalised value_counts pass over the same column.\n",
"    counts = df[col_name].value_counts()\n",
"    names = counts.index.tolist()\n",
"    freqs = counts.tolist()\n",
"    total = sum(freqs)\n",
"    percentages = [f'{100.0 * (freq / total):0.1f}%' for freq in freqs]\n",
"    # With nothing counted there is no 100% to report in the total row.\n",
"    total_percentage = '100.0%' if total else '0.0%'\n",
"    print(tabulate(\n",
"        {\n",
"            col_name: names + ['---Total---'],\n",
"            'frequency': freqs + [total],\n",
"            'percentage': percentages + [total_percentage],\n",
"        },\n",
"        headers='keys',\n",
"        tablefmt=tablefmt,\n",
"    ))"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 8,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"\n",
"*** TRAIN DEV LABEL FREQUENCIES ***\n",
"\n",
"╒═════════════╤═════════════╤══════════════╕\n",
"│ label │ frequency │ percentage │\n",
"╞═════════════╪═════════════╪══════════════╡\n",
"│ 0 │ 352386 │ 53.0% │\n",
"├─────────────┼─────────────┼──────────────┤\n",
"│ 1 │ 312656 │ 47.0% │\n",
"├─────────────┼─────────────┼──────────────┤\n",
"│ ---Total--- │ 665042 │ 100.0% │\n",
"╘═════════════╧═════════════╧══════════════╛\n"
]
}
],
"source": [
"# Label balance of the raw train/dev set.\n",
"print('\\n\\n*** TRAIN DEV LABEL FREQUENCIES ***\\n')\n",
"tabulify(df=train_dev_df, col_name='label')"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 9,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"\n",
"*** TRAIN DEV LANGUAGE FREQUENCIES ***\n",
"\n",
"╒═════════════╤═════════════╤══════════════╕\n",
"│ language │ frequency │ percentage │\n",
"╞═════════════╪═════════════╪══════════════╡\n",
"│ Hindi │ 307180 │ 46.2% │\n",
"├─────────────┼─────────────┼──────────────┤\n",
"│ Telugu │ 97012 │ 14.6% │\n",
"├─────────────┼─────────────┼──────────────┤\n",
"│ Marathi │ 72044 │ 10.8% │\n",
"├─────────────┼─────────────┼──────────────┤\n",
"│ Tamil │ 69497 │ 10.5% │\n",
"├─────────────┼─────────────┼──────────────┤\n",
"│ Malayalam │ 40965 │ 6.2% │\n",
"├─────────────┼─────────────┼──────────────┤\n",
"│ Bengali │ 22835 │ 3.4% │\n",
"├─────────────┼─────────────┼──────────────┤\n",
"│ Kannada │ 13943 │ 2.1% │\n",
"├─────────────┼─────────────┼──────────────┤\n",
"│ Odia │ 10974 │ 1.7% │\n",
"├─────────────┼─────────────┼──────────────┤\n",
"│ Gujarati │ 8828 │ 1.3% │\n",
"├─────────────┼─────────────┼──────────────┤\n",
"│ Haryanvi │ 8812 │ 1.3% │\n",
"├─────────────┼─────────────┼──────────────┤\n",
"│ Bhojpuri │ 5804 │ 0.9% │\n",
"├─────────────┼─────────────┼──────────────┤\n",
"│ Rajasthani │ 4368 │ 0.7% │\n",
"├─────────────┼─────────────┼──────────────┤\n",
"│ Assamese │ 2780 │ 0.4% │\n",
"├─────────────┼─────────────┼──────────────┤\n",
"│ ---Total--- │ 665042 │ 100.0% │\n",
"╘═════════════╧═════════════╧══════════════╛\n"
]
}
],
"source": [
"# Language distribution of the raw train/dev set.\n",
"print('\\n\\n*** TRAIN DEV LANGUAGE FREQUENCIES ***\\n')\n",
"tabulify(df=train_dev_df, col_name='language')"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 10,
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 665042/665042 [00:09<00:00, 68177.01it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"\n",
"*** TRAIN DEV LANGUAGE-LABEL PAIR FREQUENCIES ***\n",
"\n",
"╒══════════════════════╤═════════════╤══════════════╕\n",
"│ language_and_label │ frequency │ percentage │\n",
"╞══════════════════════╪═════════════╪══════════════╡\n",
"│ ('Hindi', 1) │ 153747 │ 23.1% │\n",
"├──────────────────────┼─────────────┼──────────────┤\n",
"│ ('Hindi', 0) │ 153433 │ 23.1% │\n",
"├──────────────────────┼─────────────┼──────────────┤\n",
"│ ('Telugu', 1) │ 48551 │ 7.3% │\n",
"├──────────────────────┼─────────────┼──────────────┤\n",
"│ ('Telugu', 0) │ 48461 │ 7.3% │\n",
"├──────────────────────┼─────────────┼──────────────┤\n",
"│ ('Marathi', 0) │ 44677 │ 6.7% │\n",
"├──────────────────────┼─────────────┼──────────────┤\n",
"│ ('Tamil', 0) │ 34792 │ 5.2% │\n",
"├──────────────────────┼─────────────┼──────────────┤\n",
"│ ('Tamil', 1) │ 34705 │ 5.2% │\n",
"├──────────────────────┼─────────────┼──────────────┤\n",
"│ ('Malayalam', 0) │ 31749 │ 4.8% │\n",
"├──────────────────────┼─────────────┼──────────────┤\n",
"│ ('Marathi', 1) │ 27367 │ 4.1% │\n",
"├──────────────────────┼─────────────┼──────────────┤\n",
"│ ('Bengali', 0) │ 11428 │ 1.7% │\n",
"├──────────────────────┼─────────────┼──────────────┤\n",
"│ ('Bengali', 1) │ 11407 │ 1.7% │\n",
"├──────────────────────┼─────────────┼──────────────┤\n",
"│ ('Malayalam', 1) │ 9216 │ 1.4% │\n",
"├──────────────────────┼─────────────┼──────────────┤\n",
"│ ('Kannada', 1) │ 6989 │ 1.1% │\n",
"├──────────────────────┼─────────────┼──────────────┤\n",
"│ ('Kannada', 0) │ 6954 │ 1.0% │\n",
"├──────────────────────┼─────────────┼──────────────┤\n",
"│ ('Odia', 1) │ 5499 │ 0.8% │\n",
"├──────────────────────┼─────────────┼──────────────┤\n",
"│ ('Odia', 0) │ 5475 │ 0.8% │\n",
"├──────────────────────┼─────────────┼──────────────┤\n",
"│ ('Gujarati', 0) │ 4426 │ 0.7% │\n",
"├──────────────────────┼─────────────┼──────────────┤\n",
"│ ('Haryanvi', 1) │ 4417 │ 0.7% │\n",
"├──────────────────────┼─────────────┼──────────────┤\n",
"│ ('Gujarati', 1) │ 4402 │ 0.7% │\n",
"├──────────────────────┼─────────────┼──────────────┤\n",
"│ ('Haryanvi', 0) │ 4395 │ 0.7% │\n",
"├──────────────────────┼─────────────┼──────────────┤\n",
"│ ('Bhojpuri', 0) │ 2917 │ 0.4% │\n",
"├──────────────────────┼─────────────┼──────────────┤\n",
"│ ('Bhojpuri', 1) │ 2887 │ 0.4% │\n",
"├──────────────────────┼─────────────┼──────────────┤\n",
"│ ('Rajasthani', 1) │ 2185 │ 0.3% │\n",
"├──────────────────────┼─────────────┼──────────────┤\n",
"│ ('Rajasthani', 0) │ 2183 │ 0.3% │\n",
"├──────────────────────┼─────────────┼──────────────┤\n",
"│ ('Assamese', 0) │ 1496 │ 0.2% │\n",
"├──────────────────────┼─────────────┼──────────────┤\n",
"│ ('Assamese', 1) │ 1284 │ 0.2% │\n",
"├──────────────────────┼─────────────┼──────────────┤\n",
"│ ---Total--- │ 665042 │ 100.0% │\n",
"╘══════════════════════╧═════════════╧══════════════╛\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"# Pair every row's language with its label in a single pass over the two columns;\n",
"# this builds the same (language, label) tuples without a per-row Python call\n",
"# through DataFrame.apply(axis=1).\n",
"train_dev_df['language_and_label'] = list(zip(train_dev_df['language'], train_dev_df['label']))\n",
"print('\\n\\n*** TRAIN DEV LANGUAGE-LABEL PAIR FREQUENCIES ***\\n')\n",
"tabulify(df=train_dev_df, col_name='language_and_label')"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 11,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"\n",
"*** TEST LANGUAGE FREQUENCIES ***\n",
"\n",
"╒═════════════╤═════════════╤══════════════╕\n",
"│ language │ frequency │ percentage │\n",
"╞═════════════╪═════════════╪══════════════╡\n",
"│ Hindi │ 34313 │ 46.2% │\n",
"├─────────────┼─────────────┼──────────────┤\n",
"│ Telugu │ 10877 │ 14.6% │\n",
"├─────────────┼─────────────┼──────────────┤\n",
"│ Marathi │ 8057 │ 10.9% │\n",
"├─────────────┼─────────────┼──────────────┤\n",
"│ Tamil │ 7864 │ 10.6% │\n",
"├─────────────┼─────────────┼──────────────┤\n",
"│ Malayalam │ 4465 │ 6.0% │\n",
"├─────────────┼─────────────┼──────────────┤\n",
"│ Bengali │ 2534 │ 3.4% │\n",
"├─────────────┼─────────────┼──────────────┤\n",
"│ Kannada │ 1592 │ 2.1% │\n",
"├─────────────┼─────────────┼──────────────┤\n",
"│ Odia │ 1131 │ 1.5% │\n",
"├─────────────┼─────────────┼──────────────┤\n",
"│ Haryanvi │ 1025 │ 1.4% │\n",
"├─────────────┼─────────────┼──────────────┤\n",
"│ Gujarati │ 949 │ 1.3% │\n",
"├─────────────┼─────────────┼──────────────┤\n",
"│ Bhojpuri │ 663 │ 0.9% │\n",
"├─────────────┼─────────────┼──────────────┤\n",
"│ Rajasthani │ 475 │ 0.6% │\n",
"├─────────────┼─────────────┼──────────────┤\n",
"│ Assamese │ 308 │ 0.4% │\n",
"├─────────────┼─────────────┼──────────────┤\n",
"│ ---Total--- │ 74253 │ 100.0% │\n",
"╘═════════════╧═════════════╧══════════════╛\n"
]
}
],
"source": [
"# Language distribution of the raw test set (it has no label column to tabulate).\n",
"print('\\n\\n*** TEST LANGUAGE FREQUENCIES ***\\n')\n",
"tabulify(df=test_df, col_name='language')"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "markdown",
"source": [
"## Prepared data"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
}
},
{
"cell_type": "markdown",
"source": [
"### Load prepared data"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
}
},
{
"cell_type": "code",
"execution_count": 12,
"outputs": [],
"source": [
"# Locations of the prepared train/dev/test CSVs, relative to this notebook's directory.\n",
"# The paths are assembled from their components so os.path.join actually does the joining.\n",
"PREPARED_TRAIN_PATH = os.path.join('..', 'data', 'prepared', 'train.csv')\n",
"PREPARED_DEV_PATH = os.path.join('..', 'data', 'prepared', 'dev.csv')\n",
"PREPARED_TEST_PATH = os.path.join('..', 'data', 'prepared', 'test.csv')\n",
"# Fail early, naming the missing file, instead of erroring later inside read_csv.\n",
"assert os.path.isfile(PREPARED_TRAIN_PATH), f'Prepared train CSV not found: {PREPARED_TRAIN_PATH}'\n",
"assert os.path.isfile(PREPARED_DEV_PATH), f'Prepared dev CSV not found: {PREPARED_DEV_PATH}'\n",
"assert os.path.isfile(PREPARED_TEST_PATH), f'Prepared test CSV not found: {PREPARED_TEST_PATH}'"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 13,
"outputs": [],
"source": [
"# Read the three prepared splits as UTF-8, in train / dev / test order.\n",
"prepared_train_df, prepared_dev_df, prepared_test_df = (\n",
"    pd.read_csv(csv_path, encoding='utf-8')\n",
"    for csv_path in (PREPARED_TRAIN_PATH, PREPARED_DEV_PATH, PREPARED_TEST_PATH)\n",
")\n",
"# The prepared test split must be identical to the raw test frame loaded earlier.\n",
"assert test_df.equals(prepared_test_df)"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "markdown",
"source": [
"### Compute frequencies of labels and languages of prepared data"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
}
},
{
"cell_type": "code",
"execution_count": 14,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"\n",
"*** TRAIN LABEL FREQUENCIES ***\n",
"\n",
"╒═════════════╤═════════════╤══════════════╕\n",
"│ label │ frequency │ percentage │\n",
"╞═════════════╪═════════════╪══════════════╡\n",
"│ 0 │ 334765 │ 53.0% │\n",
"├─────────────┼─────────────┼──────────────┤\n",
"│ 1 │ 297024 │ 47.0% │\n",
"├─────────────┼─────────────┼──────────────┤\n",
"│ ---Total--- │ 631789 │ 100.0% │\n",
"╘═════════════╧═════════════╧══════════════╛\n",
"\n",
"\n",
"*** DEV LABEL FREQUENCIES ***\n",
"\n",
"╒═════════════╤═════════════╤══════════════╕\n",
"│ label │ frequency │ percentage │\n",
"╞═════════════╪═════════════╪══════════════╡\n",
"│ 0 │ 17621 │ 53.0% │\n",
"├─────────────┼─────────────┼──────────────┤\n",
"│ 1 │ 15632 │ 47.0% │\n",
"├─────────────┼─────────────┼──────────────┤\n",
"│ ---Total--- │ 33253 │ 100.0% │\n",
"╘═════════════╧═════════════╧══════════════╛\n"
]
}
],
"source": [
"# Label balance of each prepared split, train first and then dev.\n",
"for split_name, split_df in (('TRAIN', prepared_train_df), ('DEV', prepared_dev_df)):\n",
"    print(f'\\n\\n*** {split_name} LABEL FREQUENCIES ***\\n')\n",
"    tabulify(df=split_df, col_name='label')"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 15,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"\n",
"*** TRAIN LANGUAGE FREQUENCIES ***\n",
"\n",
"╒═════════════╤═════════════╤══════════════╕\n",
"│ language │ frequency │ percentage │\n",
"╞═════════════╪═════════════╪══════════════╡\n",
"│ Hindi │ 291820 │ 46.2% │\n",
"├─────────────┼─────────────┼──────────────┤\n",
"│ Telugu │ 92161 │ 14.6% │\n",
"├─────────────┼─────────────┼──────────────┤\n",
"│ Marathi │ 68442 │ 10.8% │\n",
"├─────────────┼─────────────┼──────────────┤\n",
"│ Tamil │ 66022 │ 10.5% │\n",
"├─────────────┼─────────────┼──────────────┤\n",
"│ Malayalam │ 38916 │ 6.2% │\n",
"├─────────────┼─────────────┼──────────────┤\n",
"│ Bengali │ 21694 │ 3.4% │\n",
"├─────────────┼─────────────┼──────────────┤\n",
"│ Kannada │ 13246 │ 2.1% │\n",
"├─────────────┼─────────────┼──────────────┤\n",
"│ Odia │ 10425 │ 1.7% │\n",
"├─────────────┼─────────────┼──────────────┤\n",
"│ Gujarati │ 8387 │ 1.3% │\n",
"├─────────────┼─────────────┼──────────────┤\n",
"│ Haryanvi │ 8371 │ 1.3% │\n",
"├─────────────┼─────────────┼──────────────┤\n",
"│ Bhojpuri │ 5514 │ 0.9% │\n",
"├─────────────┼─────────────┼──────────────┤\n",
"│ Rajasthani │ 4150 │ 0.7% │\n",
"├─────────────┼─────────────┼──────────────┤\n",
"│ Assamese │ 2641 │ 0.4% │\n",
"├─────────────┼─────────────┼──────────────┤\n",
"│ ---Total--- │ 631789 │ 100.0% │\n",
"╘═════════════╧═════════════╧══════════════╛\n",
"\n",
"\n",
"*** DEV LANGUAGE FREQUENCIES ***\n",
"\n",
"╒═════════════╤═════════════╤══════════════╕\n",
"│ language │ frequency │ percentage │\n",
"╞═════════════╪═════════════╪══════════════╡\n",
"│ Hindi │ 15360 │ 46.2% │\n",
"├─────────────┼─────────────┼──────────────┤\n",
"│ Telugu │ 4851 │ 14.6% │\n",
"├─────────────┼─────────────┼──────────────┤\n",
"│ Marathi │ 3602 │ 10.8% │\n",
"├─────────────┼─────────────┼──────────────┤\n",
"│ Tamil │ 3475 │ 10.5% │\n",
"├─────────────┼─────────────┼──────────────┤\n",
"│ Malayalam │ 2049 │ 6.2% │\n",
"├─────────────┼─────────────┼──────────────┤\n",
"│ Bengali │ 1141 │ 3.4% │\n",
"├─────────────┼─────────────┼──────────────┤\n",
"│ Kannada │ 697 │ 2.1% │\n",
"├─────────────┼─────────────┼──────────────┤\n",
"│ Odia │ 549 │ 1.7% │\n",
"├─────────────┼─────────────┼──────────────┤\n",
"│ Gujarati │ 441 │ 1.3% │\n",
"├─────────────┼─────────────┼──────────────┤\n",
"│ Haryanvi │ 441 │ 1.3% │\n",
"├─────────────┼─────────────┼──────────────┤\n",
"│ Bhojpuri │ 290 │ 0.9% │\n",
"├─────────────┼─────────────┼──────────────┤\n",
"│ Rajasthani │ 218 │ 0.7% │\n",
"├─────────────┼─────────────┼──────────────┤\n",
"│ Assamese │ 139 │ 0.4% │\n",
"├─────────────┼─────────────┼──────────────┤\n",
"│ ---Total--- │ 33253 │ 100.0% │\n",
"╘═════════════╧═════════════╧══════════════╛\n"
]
}
],
"source": [
"# Language distribution of each prepared split, train first and then dev.\n",
"for split_name, split_df in (('TRAIN', prepared_train_df), ('DEV', prepared_dev_df)):\n",
"    print(f'\\n\\n*** {split_name} LANGUAGE FREQUENCIES ***\\n')\n",
"    tabulify(df=split_df, col_name='language')"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 16,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"\n",
"*** TRAIN LANGUAGE-LABEL PAIR FREQUENCIES ***\n",
"\n",
"╒══════════════════════╤═════════════╤══════════════╕\n",
"│ language_and_label │ frequency │ percentage │\n",
"╞══════════════════════╪═════════════╪══════════════╡\n",
"│ ('Hindi', 1) │ 146059 │ 23.1% │\n",
"├──────────────────────┼─────────────┼──────────────┤\n",
"│ ('Hindi', 0) │ 145761 │ 23.1% │\n",
"├──────────────────────┼─────────────┼──────────────┤\n",
"│ ('Telugu', 1) │ 46123 │ 7.3% │\n",
"├──────────────────────┼─────────────┼──────────────┤\n",
"│ ('Telugu', 0) │ 46038 │ 7.3% │\n",
"├──────────────────────┼─────────────┼──────────────┤\n",
"│ ('Marathi', 0) │ 42443 │ 6.7% │\n",
"├──────────────────────┼─────────────┼──────────────┤\n",
"│ ('Tamil', 0) │ 33052 │ 5.2% │\n",
"├──────────────────────┼─────────────┼──────────────┤\n",
"│ ('Tamil', 1) │ 32970 │ 5.2% │\n",
"├──────────────────────┼─────────────┼──────────────┤\n",
"│ ('Malayalam', 0) │ 30161 │ 4.8% │\n",
"├──────────────────────┼─────────────┼──────────────┤\n",
"│ ('Marathi', 1) │ 25999 │ 4.1% │\n",
"├──────────────────────┼─────────────┼──────────────┤\n",
"│ ('Bengali', 0) │ 10857 │ 1.7% │\n",
"├──────────────────────┼─────────────┼──────────────┤\n",
"│ ('Bengali', 1) │ 10837 │ 1.7% │\n",
"├──────────────────────┼─────────────┼──────────────┤\n",
"│ ('Malayalam', 1) │ 8755 │ 1.4% │\n",
"├──────────────────────┼─────────────┼──────────────┤\n",
"│ ('Kannada', 1) │ 6640 │ 1.1% │\n",
"├──────────────────────┼─────────────┼──────────────┤\n",
"│ ('Kannada', 0) │ 6606 │ 1.0% │\n",
"├──────────────────────┼─────────────┼──────────────┤\n",
"│ ('Odia', 1) │ 5224 │ 0.8% │\n",
"├──────────────────────┼─────────────┼──────────────┤\n",
"│ ('Odia', 0) │ 5201 │ 0.8% │\n",
"├──────────────────────┼─────────────┼──────────────┤\n",
"│ ('Gujarati', 0) │ 4205 │ 0.7% │\n",
"├──────────────────────┼─────────────┼──────────────┤\n",
"│ ('Haryanvi', 1) │ 4196 │ 0.7% │\n",
"├──────────────────────┼─────────────┼──────────────┤\n",
"│ ('Gujarati', 1) │ 4182 │ 0.7% │\n",
"├──────────────────────┼─────────────┼──────────────┤\n",
"│ ('Haryanvi', 0) │ 4175 │ 0.7% │\n",
"├──────────────────────┼─────────────┼──────────────┤\n",
"│ ('Bhojpuri', 0) │ 2771 │ 0.4% │\n",
"├──────────────────────┼─────────────┼──────────────┤\n",
"│ ('Bhojpuri', 1) │ 2743 │ 0.4% │\n",
"├──────────────────────┼─────────────┼──────────────┤\n",
"│ ('Rajasthani', 1) │ 2076 │ 0.3% │\n",
"├──────────────────────┼─────────────┼──────────────┤\n",
"│ ('Rajasthani', 0) │ 2074 │ 0.3% │\n",
"├──────────────────────┼─────────────┼──────────────┤\n",
"│ ('Assamese', 0) │ 1421 │ 0.2% │\n",
"├──────────────────────┼─────────────┼──────────────┤\n",
"│ ('Assamese', 1) │ 1220 │ 0.2% │\n",
"├──────────────────────┼─────────────┼──────────────┤\n",
"│ ---Total--- │ 631789 │ 100.0% │\n",
"╘══════════════════════╧═════════════╧══════════════╛\n",
"\n",
"\n",
"*** DEV LANGUAGE-LABEL PAIR FREQUENCIES ***\n",
"\n",
"╒══════════════════════╤═════════════╤══════════════╕\n",
"│ language_and_label │ frequency │ percentage │\n",
"╞══════════════════════╪═════════════╪══════════════╡\n",
"│ ('Hindi', 1) │ 7688 │ 23.1% │\n",
"├──────────────────────┼─────────────┼──────────────┤\n",
"│ ('Hindi', 0) │ 7672 │ 23.1% │\n",
"├──────────────────────┼─────────────┼──────────────┤\n",
"│ ('Telugu', 1) │ 2428 │ 7.3% │\n",
"├──────────────────────┼─────────────┼──────────────┤\n",
"│ ('Telugu', 0) │ 2423 │ 7.3% │\n",
"├──────────────────────┼─────────────┼──────────────┤\n",
"│ ('Marathi', 0) │ 2234 │ 6.7% │\n",
"├──────────────────────┼─────────────┼──────────────┤\n",
"│ ('Tamil', 0) │ 1740 │ 5.2% │\n",
"├──────────────────────┼─────────────┼──────────────┤\n",
"│ ('Tamil', 1) │ 1735 │ 5.2% │\n",
"├──────────────────────┼─────────────┼──────────────┤\n",
"│ ('Malayalam', 0) │ 1588 │ 4.8% │\n",
"├──────────────────────┼─────────────┼──────────────┤\n",
"│ ('Marathi', 1) │ 1368 │ 4.1% │\n",
"├──────────────────────┼─────────────┼──────────────┤\n",
"│ ('Bengali', 0) │ 571 │ 1.7% │\n",
"├──────────────────────┼─────────────┼──────────────┤\n",
"│ ('Bengali', 1) │ 570 │ 1.7% │\n",
"├──────────────────────┼─────────────┼──────────────┤\n",
"│ ('Malayalam', 1) │ 461 │ 1.4% │\n",
"├──────────────────────┼─────────────┼──────────────┤\n",
"│ ('Kannada', 1) │ 349 │ 1.0% │\n",
"├──────────────────────┼─────────────┼──────────────┤\n",
"│ ('Kannada', 0) │ 348 │ 1.0% │\n",
"├──────────────────────┼─────────────┼──────────────┤\n",
"│ ('Odia', 1) │ 275 │ 0.8% │\n",
"├──────────────────────┼─────────────┼──────────────┤\n",
"│ ('Odia', 0) │ 274 │ 0.8% │\n",
"├──────────────────────┼─────────────┼──────────────┤\n",
"│ ('Haryanvi', 1) │ 221 │ 0.7% │\n",
"├──────────────────────┼─────────────┼──────────────┤\n",
"│ ('Gujarati', 0) │ 221 │ 0.7% │\n",
"├──────────────────────┼─────────────┼──────────────┤\n",
"│ ('Haryanvi', 0) │ 220 │ 0.7% │\n",
"├──────────────────────┼─────────────┼──────────────┤\n",
"│ ('Gujarati', 1) │ 220 │ 0.7% │\n",
"├──────────────────────┼─────────────┼──────────────┤\n",
"│ ('Bhojpuri', 0) │ 146 │ 0.4% │\n",
"├──────────────────────┼─────────────┼──────────────┤\n",
"│ ('Bhojpuri', 1) │ 144 │ 0.4% │\n",
"├──────────────────────┼─────────────┼──────────────┤\n",
"│ ('Rajasthani', 0) │ 109 │ 0.3% │\n",
"├──────────────────────┼─────────────┼──────────────┤\n",
"│ ('Rajasthani', 1) │ 109 │ 0.3% │\n",
"├──────────────────────┼─────────────┼──────────────┤\n",
"│ ('Assamese', 0) │ 75 │ 0.2% │\n",
"├──────────────────────┼─────────────┼──────────────┤\n",
"│ ('Assamese', 1) │ 64 │ 0.2% │\n",
"├──────────────────────┼─────────────┼──────────────┤\n",
"│ ---Total--- │ 33253 │ 100.0% │\n",
"╘══════════════════════╧═════════════╧══════════════╛\n"
]
}
],
"source": [
"# Language/label pair distribution of each prepared split, train first and then dev.\n",
"# NOTE(review): relies on the prepared CSVs already containing a\n",
"# `language_and_label` column; this notebook only adds it to train_dev_df - confirm.\n",
"for split_name, split_df in (('TRAIN', prepared_train_df), ('DEV', prepared_dev_df)):\n",
"    print(f'\\n\\n*** {split_name} LANGUAGE-LABEL PAIR FREQUENCIES ***\\n')\n",
"    tabulify(df=split_df, col_name='language_and_label')"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 16,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
}
],
"metadata": {
"kernelspec": {
"name": "python3",
"language": "python",
"display_name": "Python 3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}