{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "#import numpy as np\n",
    "from scipy.special import xlogy\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "from numpy import log2\n",
    "import bitstring as bt\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "morse = pd.read_csv('IMC.csv', dtype = {'Letter':str, 'Code':str})\n",
    "morse_code_dict = {morse['Letter'][a]:morse['Code'][a] for a in range(len(morse))}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Letter</th>\n",
       "      <th>Code</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>A</td>\n",
       "      <td>1011100000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>B</td>\n",
       "      <td>11101010100000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>C</td>\n",
       "      <td>1110101110100000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>D</td>\n",
       "      <td>111010100000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>E</td>\n",
       "      <td>100000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>F</td>\n",
       "      <td>10101110100000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>G</td>\n",
       "      <td>11101110100000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>H</td>\n",
       "      <td>101010100000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>I</td>\n",
       "      <td>10100000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>J</td>\n",
       "      <td>101110111011100000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>K</td>\n",
       "      <td>11101011100000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>L</td>\n",
       "      <td>10111010100000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>M</td>\n",
       "      <td>111011100000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>N</td>\n",
       "      <td>1110100000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>O</td>\n",
       "      <td>1110111011100000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>P</td>\n",
       "      <td>1011101110100000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>Q</td>\n",
       "      <td>111011101011100000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>R</td>\n",
       "      <td>101110100000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>S</td>\n",
       "      <td>1010100000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>T</td>\n",
       "      <td>11100000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>U</td>\n",
       "      <td>101011100000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>V</td>\n",
       "      <td>10101011100000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>W</td>\n",
       "      <td>10111011100000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>X</td>\n",
       "      <td>1110101011100000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>Y</td>\n",
       "      <td>111010111011100000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>Z</td>\n",
       "      <td>1110111010100000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>_</td>\n",
       "      <td>00000000000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Letter                   Code\n",
       "0       A          1011100000000\n",
       "1       B      11101010100000000\n",
       "2       C    1110101110100000000\n",
       "3       D        111010100000000\n",
       "4       E              100000000\n",
       "5       F      10101110100000000\n",
       "6       G      11101110100000000\n",
       "7       H        101010100000000\n",
       "8       I            10100000000\n",
       "9       J  101110111011100000000\n",
       "10      K      11101011100000000\n",
       "11      L      10111010100000000\n",
       "12      M        111011100000000\n",
       "13      N          1110100000000\n",
       "14      O    1110111011100000000\n",
       "15      P    1011101110100000000\n",
       "16      Q  111011101011100000000\n",
       "17      R        101110100000000\n",
       "18      S          1010100000000\n",
       "19      T            11100000000\n",
       "20      U        101011100000000\n",
       "21      V      10101011100000000\n",
       "22      W      10111011100000000\n",
       "23      X    1110101011100000000\n",
       "24      Y  111010111011100000000\n",
       "25      Z    1110111010100000000\n",
       "26      _         00000000000000"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "morse"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Implements the Huffman algorithm to construct an optimal code\n",
    "#Not the most efficient implementation (due to the memory management of dicts)\n",
    "def huffman(data_dict):\n",
    "    working_dict = data_dict.copy()\n",
    "    codewords = {x[0]:bt.Bits(bin='') for x in data_dict.keys()}\n",
    "    while(len(working_dict)>1):\n",
    "        lowest_ind= min(working_dict, key=working_dict.get) #get entry with smallest counts\n",
    "        lowest_count = working_dict.pop(lowest_ind)\n",
    "        second_ind = min(working_dict, key=working_dict.get) # get entry with second smallest counts\n",
    "        second_count = working_dict.pop(second_ind)\n",
    "\n",
    "        merge_ind = lowest_ind + second_ind    #build concatenated index\n",
    "        merge_count = lowest_count+ second_count\n",
    "        working_dict[merge_ind] = merge_count\n",
    "\n",
    "        #Prefix existing codewords to account for the combined entry\n",
    "        for x in lowest_ind:\n",
    "            codewords[x] = bt.Bits(bin='1')+codewords[x] #Notice we prefix - this gives an instantaneous code\n",
    "        for x in second_ind:\n",
    "            codewords[x] = bt.Bits(bin='0')+codewords[x]\n",
    "    return(codewords)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_single = pd.read_csv('single_counts.csv', index_col=0)['Count']\n",
    "data_dict_single = {(x,): data_single[x] for x in data_single.index}               #Efficiency suggests removing codewords with zero counts, but this may make some messages impossible to encode\n",
    "data_dict_indep_pairs = {(x+y,): data_single[x]*data_single[y] for x in data_single.index for y in data_single.index}               #Efficiency suggests removing codewords with zero counts, but this may make some messages impossible to encode"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Average Morse length: 14.071901290986666\n"
     ]
    }
   ],
   "source": [
    "average_length_morse = sum([data_single[x]*len(morse_code_dict[x]) for x in morse_code_dict])/sum(data_single)\n",
    "print('Average Morse length: ' +str(average_length_morse))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'A': '0101', 'B': '011111', 'C': '001100', 'D': '01000', 'E': '0000', 'F': '001110', 'G': '011101', 'H': '1011', 'I': '1001', 'J': '0011110101', 'K': '00111100', 'L': '01001', 'M': '000111', 'N': '1000', 'O': '0110', 'P': '011110', 'Q': '00111101000', 'R': '00010', 'S': '1010', 'T': '0010', 'U': '000110', 'V': '0011111', 'W': '001101', 'X': '001111011', 'Y': '011100', 'Z': '00111101001', '_': '11'}\n",
      "Average length per character: 4.122016250811564\n"
     ]
    }
   ],
   "source": [
    "single_codewords = huffman(data_dict_single)\n",
    "print({x:single_codewords[x].bin for x in single_codewords})\n",
    "average_length = sum([data_single[x]*len(single_codewords[x].bin) for x in single_codewords])/sum(data_single)\n",
    "print('Average length per character: ' + str(average_length))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'AA': '01011000', 'AB': '1011110000', 'AC': '0001010011', 'AD': '010011011', 'AE': '1011101', 'AF': '0010110111', 'AG': '0101111110', 'AH': '10111110', 'AI': '10000100', 'AJ': '00101001100010', 'AK': '10100111101', 'AL': '010101111', 'AM': '0000000010', 'AN': '01111000', 'AO': '01100100', 'AP': '1001001011', 'AQ': '10100111011110', 'AR': '11010110', 'AS': '10100110', 'AT': '00100001', 'AU': '101111111', 'AV': '01100010000', 'AW': '0010001000', 'AX': '00010100000111', 'AY': '0101010001', 'AZ': '100001101001110', 'A_': '110011', 'BA': '1010111111', 'BB': '0010100110000', 'BC': '011100000111', 'BD': '10100100001', 'BE': '0010010011', 'BF': '100100111111', 'BG': '110100101001', 'BH': '00101001011', 'BI': '00000000110', 'BJ': '1000011010011110', 'BK': '00011001100110', 'BL': '10111100010', 'BM': '011000100010', 'BN': '1111110010', 'BO': '1101011110', 'BP': '0000100101011', 'BQ': '00011001100001111', 'BR': '01001011100', 'BS': '00010111010', 'BT': '0111100111', 'BU': '001011001100', 'BV': '1101001011111', 'BW': '100000100001', 'BX': '0110101110111010', 'BY': '101011011101', 'BZ': '000000001110111101', 'B_': '001011101', 'CA': '0001010010', 'CB': '011100000110', 'CC': '11100000011', 'CD': '00001001001', 'CE': '011100010', 'CF': '11111100000', 'CG': '000111110011', 'CH': '0111001001', 'CI': '0101010101', 'CJ': '111100110110101', 'CK': '0110101110110', 'CL': '00010100010', 'CM': '10101110010', 'CN': '0101001111', 'CO': '0010001010', 'CP': '010110111011', 'CQ': '0110101110111000', 'CR': '1000011001', 'CS': '0110100100', 'CT': '111100000', 'CU': '01110010111', 'CV': '0010001011000', 'CW': '11110011010', 'CX': '111000110101000', 'CY': '000100101001', 'CZ': '01010010100111011', 'C_': '01110110', 'DA': '010011010', 'DB': '10100100000', 'DC': '00001001000', 'DD': '0010100100', 'DE': '10100010', 'DF': '00011001110', 'DG': '01010100100', 'DH': '101000111', 'DI': '011011111', 'DJ': '000110011000000', 'DK': '100110100111', 'DL': '0100110010', 'DM': '1111001100', 'DN': 
'011010101', 'DO': '010101100', 'DP': '01110011010', 'DQ': '100001101011111', 'DR': '101011110', 'DS': '100100100', 'DT': '000100111', 'DU': '1010011100', 'DV': '010101001011', 'DW': '00010010111', 'DX': '000000001110100', 'DY': '01001000010', 'DZ': '0111000001010100', 'D_': '1010101', 'EA': '1011100', 'EB': '0010010010', 'EC': '011100001', 'ED': '10100001', 'EE': '0010011', 'EF': '100110010', 'EG': '110100001', 'EH': '00101000', 'EI': '00000010', 'EJ': '1000011010101', 'EK': '00011111000', 'EL': '10110001', 'EM': '011000011', 'EN': '1111101', 'EO': '1101101', 'EP': '0000100111', 'EQ': '00011001100011', 'ER': '01001010', 'ES': '00010110', 'ET': '0111110', 'EU': '001010110', 'EV': '1101010011', 'EW': '011111111', 'EX': '0111000001001', 'EY': '101011001', 'EZ': '000000001110110', 'E_': '001101', 'FA': '0010110110', 'FB': '100100111110', 'FC': '11110111111', 'FD': '00011001101', 'FE': '100110001', 'FF': '000101000000', 'FG': '010011001100', 'FH': '1001100111', 'FI': '0110011010', 'FJ': '0000111111101001', 'FK': '1000011010010', 'FL': '00101101000', 'FM': '11100010101', 'FN': '0110010111', 'FO': '0101001001', 'FP': '011010111010', 'FQ': '1000011010011000', 'FR': '1010010010', 'FS': '0111100101', 'FT': '0000100011', 'FU': '10011010101', 'FV': '0101001010001', 'FW': '000010010100', 'FX': '111111000010001', 'FY': '001000101111', 'FZ': '01101011010000011', 'F_': '10011110', 'GA': '0101111101', 'GB': '110100101000', 'GC': '000111110010', 'GD': '01010010111', 'GE': '110100000', 'GF': '010010111011', 'GG': '011001100010', 'GH': '1101001001', 'GI': '1001001101', 'GJ': '0100101110100010', 'GK': '1011110001101', 'GL': '01011111000', 'GM': '000010010111', 'GN': '1001000100', 'GO': '0110100110', 'GP': '100111110101', 'GQ': '1011110001110000', 'GR': '1110001100', 'GS': '1010110100', 'GT': '0010101110', 'GU': '11010010110', 'GV': '0110011000111', 'GW': '001010011010', 'GX': '0001100110000110', 'GY': '010110111001', 'GZ': '10011111011011000', 'G_': '11011111', 'HA': '10111101', 'HB': 
'00101001010', 'HC': '0111001000', 'HD': '101000110', 'HE': '00100101', 'HF': '1001100110', 'HG': '1101001000', 'HH': '001010100', 'HI': '000000111', 'HJ': '10000110101101', 'HK': '000111111001', 'HL': '101101001', 'HM': '0110001010', 'HN': '11111110', 'HO': '11011101', 'HP': '00001111101', 'HQ': '000110011001110', 'HR': '010010110', 'HS': '000110000', 'HT': '10000001', 'HU': '0010110000', 'HV': '11010111001', 'HW': '1000001010', 'HX': '01110000010100', 'HY': '1010110110', 'HZ': '0000000011101110', 'H_': '0011110', 'IA': '10000011', 'IB': '00000000001', 'IC': '0101010100', 'ID': '011011110', 'IE': '00000001', 'IF': '0110011001', 'IG': '1001001100', 'IH': '000000110', 'II': '11010001', 'IJ': '01100101010100', 'IK': '11110011111', 'IL': '011111101', 'IM': '0011100111', 'IN': '11000001', 'IO': '10011100', 'IP': '1110000000', 'IQ': '11110011011111', 'IR': '000100001', 'IS': '11110010', 'IT': '01011101', 'IU': '0000100001', 'IV': '10011010010', 'IW': '0101111011', 'IX': '01010010100110', 'IY': '0111001111', 'IZ': '110100101110110', 'I_': '0000110', 'JA': '00100010110011', 'JB': '1000011010011001', 'JC': '111100110110100', 'JD': '000101000001011', 'JE': '1000011010100', 'JF': '0000111111101000', 'JG': '0010100110001101', 'JH': '10000110101100', 'JI': '01011111001011', 'JJ': '1110001101010111011', 'JK': '10000110100001000', 'JL': '001000101100101', 'JM': '110100101110100', 'JN': '01011111001010', 'JO': '01001011101010', 'JP': '0110101101000000', 'JQ': '10011111011011001101', 'JR': '10011111011001', 'JS': '01110000010111', 'JT': '1111110000110', 'JU': '100001101011101', 'JV': '01010010100111001', 'JW': '111111000010011', 'JX': '1110001101010111010', 'JY': '0001100110010001', 'JZ': '111000110101011100000', 'J_': '100100111101', 'KA': '10100111100', 'KB': '00011001100101', 'KC': '0110101101001', 'KD': '100110100110', 'KE': '00011001111', 'KF': '1000011010001', 'KG': '1011110001100', 'KH': '000111111000', 'KI': '11110011110', 'KJ': '01110011000110111', 'KK': 
'000110011000001', 'KL': '101001110110', 'KM': '0101101111001', 'KN': '11110011101', 'KO': '11000110001', 'KP': '00000000111000', 'KQ': '000011111110110011', 'KR': '001011010100', 'KS': '000100101011', 'KT': '01110010101', 'KU': '0001111110101', 'KV': '10111100011110', 'KW': '0111001100010', 'KX': '01101011010000101', 'KY': '1010011101001', 'KZ': '111000110101011111', 'K_': '0010010001', 'LA': '010101110', 'LB': '10101111101', 'LC': '00010100001', 'LD': '0100110001', 'LE': '10110000', 'LF': '00101100111', 'LG': '01011011111', 'LH': '101101000', 'LI': '011111100', 'LJ': '001000101100100', 'LK': '101001110101', 'LL': '0101010011', 'LM': '00000000000', 'LN': '011101001', 'LO': '011000001', 'LP': '10010001011', 'LQ': '101001110111000', 'LR': '110100111', 'LS': '101000001', 'LT': '000111101', 'LU': '1011110011', 'LV': '010111111100', 'LW': '00011111011', 'LX': '000101000001010', 'LY': '01010010110', 'LZ': '1000011010000110', 'L_': '1100010', 'MA': '0000000001', 'MB': '010111111101', 'MC': '10101110001', 'MD': '1111000011', 'ME': '011000010', 'MF': '11100010100', 'MG': '000010010110', 'MH': '0110001001', 'MI': '0011100110', 'MJ': '110100101110011', 'MK': '0101101111000', 'ML': '1111110011', 'MM': '10011010000', 'MN': '0010110010', 'MO': '0001000101', 'MP': '010011001110', 'MQ': '0101111100100001', 'MR': '0110101100', 'MS': '0101011011', 'MT': '110001111', 'MU': '01100110000', 'MV': '0000111111111', 'MW': '11000110111', 'MX': '101001110111111', 'MY': '11110111110', 'MZ': '01001011101000111', 'M_': '01101000', 'NA': '01110111', 'NB': '1111110001', 'NC': '0101001110', 'ND': '011010100', 'NE': '1111100', 'NF': '0110010110', 'NG': '1000011011', 'NH': '11111101', 'NI': '11000000', 'NJ': '01011111001001', 'NK': '11110011100', 'NL': '011101000', 'NM': '0010110001', 'NN': '10110101', 'NO': '10010101', 'NP': '1101010001', 'NQ': '11110011011100', 'NR': '000011101', 'NS': '11101111', 'NT': '01011010', 'NU': '111111111', 'NV': '10010011101', 'NW': '0101101101', 'NX': 
'01010010100100', 'NY': '0111000111', 'NZ': '110100101110010', 'N_': '0000011', 'OA': '01100011', 'OB': '1101011101', 'OC': '0010001001', 'OD': '010101011', 'OE': '1101100', 'OF': '0101001000', 'OG': '0110100101', 'OH': '11011100', 'OI': '10011011', 'OJ': '01001011101001', 'OK': '11000110000', 'OL': '011000000', 'OM': '0001000100', 'ON': '10010100', 'OO': '01101100', 'OP': '1001111111', 'OQ': '11010010111000', 'OR': '11101101', 'OS': '10110011', 'OT': '00111000', 'OU': '111000010', 'OV': '01101011100', 'OW': '0011100101', 'OX': '00011111101101', 'OY': '0101111001', 'OZ': '100111110110101', 'O_': '111010', 'PA': '1001001010', 'PB': '0000100101010', 'PC': '010110111010', 'PD': '01110011001', 'PE': '0000100110', 'PF': '011010110101', 'PG': '100111110100', 'PH': '00001111100', 'PI': '1101011111', 'PJ': '0110010101010111', 'PK': '1111110000111', 'PL': '10010001010', 'PM': '010011001101', 'PN': '1101010000', 'PO': '1001111110', 'PP': '111000110100', 'PQ': '00000000111001001', 'PR': '00010111000', 'PS': '1111011110', 'PT': '0110010100', 'PU': '000011111101', 'PV': '1001111101111', 'PW': '011001010100', 'PX': '0101001010011111', 'PY': '100000100011', 'PZ': '11100011010101101', 'P_': '000100100', 'QA': '10100111011101', 'QB': '00011001100001110', 'QC': '0110101101000011', 'QD': '100001101011110', 'QE': '00011001100010', 'QF': '1000011010000111', 'QG': '1010011101110011', 'QH': '000110011001001', 'QI': '11110011011110', 'QJ': '10011111011011001100', 'QK': '000011111110110010', 'QL': '100111110110111', 'QM': '0101111100100000', 'QN': '11110011011011', 'QO': '10111100011111', 'QP': '00000000111001000', 'QQ': '11100011010101110001', 'QR': '010010111010000', 'QS': '000101000001000', 'QT': '01110011000001', 'QU': '0001100110011111', 'QV': '10111100011100011', 'QW': '0111001100011010', 'QX': '01110011000110110011', 'QY': '1010011101110010', 'QZ': '0111001100011011000100', 'Q_': '0010001011011', 'RA': '11010101', 'RB': '01001000011', 'RC': '1000011000', 'RD': '101011101', 'RE': 
'01001001', 'RF': '1010010001', 'RG': '1110001011', 'RH': '010010001', 'RI': '000100000', 'RJ': '10011111011000', 'RK': '001011001101', 'RL': '110100110', 'RM': '0110100111', 'RN': '000011100', 'RO': '11101100', 'RP': '00010100011', 'RQ': '001010011000111', 'RR': '010100110', 'RS': '001000111', 'RT': '10010111', 'RU': '0100110000', 'RV': '11110000100', 'RW': '1001100001', 'RX': '10000110100000', 'RY': '1100011010', 'RZ': '0001100110000100', 'R_': '0101000', 'SA': '10100101', 'SB': '00010111001', 'SC': '0110011011', 'SD': '100100011', 'SE': '00010101', 'SF': '0111100100', 'SG': '1010011111', 'SH': '000101111', 'SI': '11110001', 'SJ': '01110000010110', 'SK': '000100101010', 'SL': '101000000', 'SM': '0101011010', 'SN': '11101110', 'SO': '10110010', 'SP': '1111011101', 'SQ': '000011111110111', 'SR': '001000110', 'SS': '000011110', 'ST': '01101110', 'SU': '0001100101', 'SV': '10101111100', 'SW': '0111000000', 'SX': '01100101010111', 'SY': '1001111100', 'SZ': '111100110111011', 'S_': '0001110', 'TA': '00100000', 'TB': '0111100110', 'TC': '111000111', 'TD': '000100110', 'TE': '0111101', 'TF': '0000100010', 'TG': '0010100111', 'TH': '10000000', 'TI': '01011100', 'TJ': '1111110000101', 'TK': '01110010100', 'TL': '000111100', 'TM': '110001110', 'TN': '01011001', 'TO': '00101111', 'TP': '0110001011', 'TQ': '01110011000000', 'TR': '10010110', 'TS': '01101101', 'TT': '1111010', 'TU': '100001011', 'TV': '01001000000', 'TW': '111101101', 'TX': '1111001101100', 'TY': '0001100011', 'TZ': '011001010101010', 'T_': '100011', 'UA': '101111110', 'UB': '001010011011', 'UC': '01110010110', 'UD': '1010010011', 'UE': '001010101', 'UF': '10011010100', 'UG': '11010010101', 'UH': '0010101111', 'UI': '0000100000', 'UJ': '100001101011100', 'UK': '0001111110100', 'UL': '1011110010', 'UM': '01100101011', 'UN': '111111110', 'UO': '111000001', 'UP': '000011111100', 'UQ': '0001100110011110', 'UR': '0100101111', 'US': '0001100100', 'UT': '100001010', 'UU': '00101101001', 'UV': '111000000101', 'UW': 
'10000010111', 'UX': '011100110001100', 'UY': '10101110000', 'UZ': '00001111111011000', 'U_': '00111111', 'VA': '01011111111', 'VB': '1101001011110', 'VC': '0001111110111', 'VD': '010101001010', 'VE': '1101010010', 'VF': '0101001010000', 'VG': '0110011000110', 'VH': '11010111000', 'VI': '10011010001', 'VJ': '01010010100111000', 'VK': '10111100011101', 'VL': '010111110011', 'VM': '0000111111110', 'VN': '10010011100', 'VO': '01101011011', 'VP': '1001111101110', 'VQ': '10111100011100010', 'VR': '11100011011', 'VS': '10101110011', 'VT': '00101101011', 'VU': '111000000100', 'VV': '01101011010001', 'VW': '0010110101011', 'VX': '00101001100011001', 'VY': '0101101111011', 'VZ': '100111110110110010', 'V_': '111000100', 'WA': '0001111111', 'WB': '100000100000', 'WC': '11110000101', 'WD': '00010010110', 'WE': '011111110', 'WF': '000000001111', 'WG': '001010011001', 'WH': '1000001001', 'WI': '0101111010', 'WJ': '111111000010010', 'WK': '0111001100001', 'WL': '00011111010', 'WM': '11000110110', 'WN': '0101101100', 'WO': '0011100100', 'WP': '011000100011', 'WQ': '0111000001010101', 'WR': '1001100000', 'WS': '0110101111', 'WT': '111101100', 'WU': '10000010110', 'WV': '0010110101010', 'WW': '11110111000', 'WX': '111000110101010', 'WY': '000101110111', 'WZ': '01100101010101101', 'W_': '10010000', 'XA': '00010100000110', 'XB': '0110101110111001', 'XC': '110100101110111', 'XD': '000000001110011', 'XE': '0111000001000', 'XF': '111111000010000', 'XG': '0001100110000101', 'XH': '01101011101111', 'XI': '01010010100101', 'XJ': '1110001101010111001', 'XK': '01101011010000100', 'XL': '000101000001001', 'XM': '101001110111110', 'XN': '01001011101011', 'XO': '00011111101100', 'XP': '0101001010011110', 'XQ': '01110011000110110010', 'XR': '01110011000111', 'XS': '01100101010110', 'XT': '1110001101011', 'XU': '011100000101011', 'XV': '00101001100011000', 'XW': '111000110101001', 'XX': '1001111101101100111', 'XY': '0000111111101011', 'XZ': '011100110001101100001', 'X_': '011100110111', 'YA': 
'0101010000', 'YB': '101011011100', 'YC': '000100101000', 'YD': '01001000001', 'YE': '101011000', 'YF': '001000101110', 'YG': '010110111000', 'YH': '1010110101', 'YI': '0111001110', 'YJ': '0001100110010000', 'YK': '1010011101000', 'YL': '01010010101', 'YM': '11110111001', 'YN': '0111000110', 'YO': '0101111000', 'YP': '100000100010', 'YQ': '1001111101101101', 'YR': '1100011001', 'YS': '1001101011', 'YT': '0001100010', 'YU': '10101101111', 'YV': '0101101111010', 'YW': '000101110110', 'YX': '0000111111101010', 'YY': '010011001111', 'YZ': '10000110100111110', 'Y_': '10110111', 'ZA': '100001101001101', 'ZB': '000000001110111100', 'ZC': '01010010100111010', 'ZD': '0110101110111011', 'ZE': '000000001110101', 'ZF': '01101011010000010', 'ZG': '10000110100111111', 'ZH': '0000000011100101', 'ZI': '110100101110101', 'ZJ': '011100110001101100011', 'ZK': '111000110101011110', 'ZL': '1000011010000101', 'ZM': '01001011101000110', 'ZN': '101111000111001', 'ZO': '100111110110100', 'ZP': '11100011010101100', 'ZQ': '111000110101011100001', 'ZR': '0000111111101101', 'ZS': '111100110111010', 'ZT': '010111110010001', 'ZU': '00000000111011111', 'ZV': '011100110001101101', 'ZW': '01100101010101100', 'ZX': '011100110001101100000', 'ZY': '10000110100001001', 'ZZ': '0111001100011011000101', 'Z_': '00001111111001', '_A': '110010', '_B': '001011100', '_C': '01110101', '_D': '1010100', '_E': '001100', '_F': '10011101', '_G': '11011110', '_H': '0011101', '_I': '0000101', '_J': '100100111100', '_K': '0010010000', '_L': '1100001', '_M': '01100111', '_N': '0000010', '_O': '111001', '_P': '000100011', '_Q': '0010001011010', '_R': '0100111', '_S': '0001101', '_T': '100010', '_U': '00111110', '_V': '111000011', '_W': '10000111', '_X': '011100110110', '_Y': '10110110', '_Z': '00001111111000', '__': '01000'}\n",
      "Average length per character (assuming independence): 4.094180154590685\n"
     ]
    }
   ],
   "source": [
    "double_indept_codewords= huffman(data_dict_indep_pairs)\n",
    "print({x:double_indept_codewords[x].bin for x in double_indept_codewords})\n",
    "average_length = sum([data_dict_indep_pairs[x]*len(double_indept_codewords[x[0]].bin) for x in data_dict_indep_pairs])/sum([data_dict_indep_pairs[x] for x in data_dict_indep_pairs])\n",
    "print('Average length per character (assuming independence): '+ str(average_length/2))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_pairs = pd.read_csv('double_counts.csv', index_col=0, keep_default_na=False)['Count'] #need to deal with NA being read as not-a-number....\n",
    "data_dict_pairs = {(x,): data_pairs[x] for x in data_pairs.index}               #Efficiency suggests removing codewords with zero counts, but this may make some messages impossible to encode"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'AA': '1010101000000010', 'AB': '0011001110', 'AC': '010110100', 'AD': '000011001', 'AE': '1101101011011', 'AF': '01011100000', 'AG': '0010101101', 'AH': '1011011011111', 'AI': '001001110', 'AJ': '11000111111100', 'AK': '0100101111', 'AL': '00110001', 'AM': '011011011', 'AN': '011100', 'AO': '0011001111000100', 'AP': '0101110001', 'AQ': '00011001110001110', 'AR': '1100000', 'AS': '1110010', 'AT': '0101011', 'AU': '1011010111', 'AV': '100000111', 'AW': '00101111100', 'AX': '11101111010001', 'AY': '010001111', 'AZ': '00011001110000', 'A_': '00100101', 'BA': '1010101110', 'BB': '1110111100101', 'BC': '01000110011000101111011', 'BD': '01001111001011010', 'BE': '01110110', 'BF': '000110011100011001100111', 'BG': '010110110011110111101', 'BH': '000001010100010000', 'BI': '01110111010', 'BJ': '00101100101011', 'BK': '000110011100011001100110', 'BL': '0000011100', 'BM': '00110011110001010', 'BN': '0001100111010110101', 'BO': '110110100', 'BP': '000110011100011001100101', 'BQ': '000110011100011001010000010011011', 'BR': '1010001000', 'BS': '111011110101', 'BT': '1011011010101', 'BU': '011011100', 'BV': '00011001110001100100', 'BW': '1010101000000011000', 'BX': '000110011100011001010000010011010', 'BY': '0110100100', 'BZ': '101101101011101000110', 'B_': '0011001111101', 'CA': '001100101', 'CB': '1100011111110100', 'CC': '000011100110', 'CD': '01000100000001101', 'CE': '10100011', 'CF': '01101101001011000110', 'CG': '0001100111000110111110', 'CH': '10110100', 'CI': '0101111000', 'CJ': '000110011100011001010000010011001', 'CK': '0101010000', 'CL': '1101101010', 'CM': '10110110101110100010', 'CN': '0110000000100100101', 'CO': '01101010', 'CP': '000110011100011001100100', 'CQ': '001011001001100', 'CR': '1110011011', 'CS': '000001010100011', 'CT': '0000111000', 'CU': '00000111010', 'CV': '0100010000000100011010', 'CW': '00110011110001011111', 'CX': '00011001110001100101000000', 'CY': '0011001111001', 'CZ': '110110101100111100', 'C_': '11101111000', 'DA': '0101101101', 
'DB': '010110110011110101', 'DC': '11011010110011010', 'DD': '010011110011', 'DE': '01100011', 'DF': '010011110111011', 'DG': '101010111101', 'DH': '00101100100110110', 'DI': '001101010', 'DJ': '00101100100110100', 'DK': '10110110101001000', 'DL': '010001100100', 'DM': '00011001110100', 'DN': '011000000011', 'DO': '010111111', 'DP': '010001000000010000', 'DQ': '110110101100110111', 'DR': '00001100000', 'DS': '1000001101', 'DT': '00011001110101100', 'DU': '01101101010', 'DV': '1010101111110', 'DW': '10001011101000', 'DX': '000110011100011001100011', 'DY': '01101001010', 'DZ': '000110011100011001010000010011000', 'D_': '000100', 'EA': '00100100', 'EB': '0010111111000', 'EC': '100111111', 'ED': '1100100', 'EE': '11011011', 'EF': '1001111100', 'EG': '01011100001', 'EH': '0011001111111', 'EI': '0010110011', 'EJ': '111011110100000', 'EK': '0100111101101', 'EL': '10011100', 'EM': '010011111', 'EN': '0101001', 'EO': '11101111011', 'EP': '1000001100', 'EQ': '0100010000001', 'ER': '101110', 'ES': '1010000', 'ET': '11111011', 'EU': '0000111001111', 'EV': '110000110', 'EW': '00100010100', 'EX': '1001110110', 'EY': '0000011010', 'EZ': '0000010101000101', 'E_': '00111', 'FA': '0010101110', 'FB': '0100010000000100011001', 'FC': '0100010000000100011000', 'FD': '1010101000000011001011', 'FE': '111110100', 'FF': '1100011110', 'FG': '0010110010011010110111', 'FH': '010001100110001011001', 'FI': '111011111', 'FJ': '0001100111000110010101111', 'FK': '0001100111000110010101110', 'FL': '10011101110', 'FM': '0001100111000110110001', 'FN': '00101100100110101000', 'FO': '11000110', 'FP': '101010100000001100111', 'FQ': '000110011100011001010000010010111', 'FR': '0010001001', 'FS': '011000000010011', 'FT': '00101011001', 'FU': '00011001001', 'FV': '0001100111000110010101101', 'FW': '101101101011101001011', 'FX': '000110011100011001010000010010110', 'FY': '11000111111110', 'FZ': '0101101100111101101011', 'F_': '1010100', 'GA': '1000101111', 'GB': '1011011010100101', 'GC': 
'1010101000000011001010', 'GD': '0011001111000111', 'GE': '010001001', 'GF': '010110110011110100', 'GG': '111011110011', 'GH': '010111011', 'GI': '1100001110', 'GJ': '0001100111000110010101100', 'GK': '010110110011110111100', 'GL': '01101000001', 'GM': '10101010010100', 'GN': '010001100101', 'GO': '110011101', 'GP': '010110110011110111011', 'GQ': '000110011100011001010000010010101', 'GR': '0011010110', 'GS': '10001011011', 'GT': '00110011110111', 'GU': '10001011010', 'GV': '000110011100011001100010', 'GW': '1101101011001110', 'GX': '000110011100011001010000010010100', 'GY': '000110011100010', 'GZ': '100010111010010000100', 'G_': '00100001', 'HA': '0101000', 'HB': '010001100110000', 'HC': '0100010000000100010', 'HD': '00101100100110111', 'HE': '000111', 'HF': '101101101011110', 'HG': '0101101100111101101010', 'HH': '0000010101000100101', 'HI': '1010110', 'HJ': '01000110011000101111010', 'HK': '10001011101001000011', 'HL': '10101010010111', 'HM': '01101101001100', 'HN': '00110011111000', 'HO': '01100001', 'HP': '0101101100111101100', 'HQ': '011011010010011010', 'HR': '01000110001', 'HS': '0110110100111', 'HT': '0000111010', 'HU': '00000111011', 'HV': '101010100000001100110', 'HW': '0110110100100111', 'HX': '000110011100011001010000010010011', 'HY': '01010100011', 'HZ': '000110011100011001010000010010010', 'H_': '00100000', 'IA': '0111011110', 'IB': '000011000011', 'IC': '001001101', 'ID': '011000001', 'IE': '010110111', 'IF': '0001100101', 'IG': '101010110', 'IH': '1110111101000011', 'II': '0101101100110', 'IJ': '00011001110101101101', 'IK': '10101010101', 'IL': '11100111', 'IM': '001100100', 'IN': '101001', 'IO': '001011110', 'IP': '01110111011', 'IQ': '101010111110000', 'IR': '010001101', 'IS': '1001011', 'IT': '0110010', 'IU': '0000010101001', 'IV': '0010001000', 'IW': '0010110010011010111', 'IX': '0011001111010', 'IY': '000110011100011001100001', 'IZ': '0000110000101', 'I_': '10001000', 'JA': '1010101000001', 'JB': '000110011100011001010000010010001', 'JC': 
'000110011100011001010000010010000', 'JD': '000110011100011001010000010001111', 'JE': '011101111101', 'JF': '000110011100011001010000010001110', 'JG': '000110011100011001010000010001101', 'JH': '000110011100011001100000', 'JI': '000110011101010', 'JJ': '0010110010011010110110', 'JK': '000110011100011001010000010001100', 'JL': '000110011100011001010000010001011', 'JM': '000110011100011001010000010001010', 'JN': '000110011100011001011111', 'JO': '001000101011', 'JP': '01000110011000101111001', 'JQ': '000110011100011001010000010001001', 'JR': '010001100110001011000', 'JS': '000110011100011001010000010001000', 'JT': '000110011100011001010000010000111', 'JU': '10001011001', 'JV': '000110011100011001010000010000110', 'JW': '000110011100011001010000010000101', 'JX': '0001100111000110010101011', 'JY': '0001100111000110010101010', 'JZ': '0001100111000110010101001', 'J_': '00011001110001111', 'KA': '00101111111010', 'KB': '0010110010011010101', 'KC': '01101101001001101111', 'KD': '101101101011101001010', 'KE': '010111101', 'KF': '110001111111011', 'KG': '101010100000001101', 'KH': '100010111010010010', 'KI': '1001111101', 'KJ': '0101101100111101101001', 'KK': '00110011110001011110', 'KL': '00001110011101', 'KM': '01101101001101011', 'KN': '00011001111', 'KO': '000011000010000', 'KP': '010110110011110111010', 'KQ': '000110011100011001010000010000100', 'KR': '000001010100010001', 'KS': '11100110100', 'KT': '00011001110001101110', 'KU': '1101101011001100', 'KV': '0101101100111101101000', 'KW': '1110111101000010', 'KX': '000110011100011001010000010000011', 'KY': '10110110101101', 'KZ': '000110011100011001010000010000010', 'K_': '010101011', 'LA': '000001011', 'LB': '11011010110010', 'LC': '01000110011001', 'LD': '001100110', 'LE': '00011000', 'LF': '00101111101', 'LG': '011011010010111', 'LH': '001100111100010110', 'LI': '10011110', 'LJ': '000110011100011001010000010000001', 'LK': '100010111000', 'LL': '00101101', 'LM': '0100111101111', 'LN': '011011010011011', 'LO': 
'000001100', 'LP': '0110000000101', 'LQ': '0010110010011010110101', 'LR': '10101010000001', 'LS': '00000101011', 'LT': '01010100010', 'LU': '00110111100', 'LV': '100010111001', 'LW': '0010110010100', 'LX': '010001100110001010', 'LY': '001101110', 'LZ': '010001000000010001111', 'L_': '00001101', 'MA': '10101110', 'MB': '01101001011', 'MC': '01000100000001011', 'MD': '0110000000100100100', 'ME': '00001111', 'MF': '01011011001110', 'MG': '101101101011101000111', 'MH': '00011001110101101100', 'MI': '101000101', 'MJ': '01000110011000101111000', 'MK': '0100011001100010111111', 'ML': '000011100111000', 'MM': '11100110101', 'MN': '1110111101001', 'MO': '010010101', 'MP': '0110100111', 'MQ': '010001100110001000', 'MR': '0010101100001', 'MS': '01001111010', 'MT': '011011010010110100', 'MU': '00000101001', 'MV': '00101100100110101100', 'MW': '110110101100110110', 'MX': '000110011100011001011110', 'MY': '0000110001', 'MZ': '1010101000000011001001', 'M_': '10001010', 'NA': '0010100110', 'NB': '000011000010001', 'NC': '010111110', 'ND': '111111', 'NE': '00110100', 'NF': '010001110110', 'NG': '1011110', 'NH': '10101011111001', 'NI': '110011000', 'NJ': '00101100101010', 'NK': '01001011101', 'NL': '11000111110', 'NM': '01101101001000', 'NN': '01101101011', 'NO': '01000101', 'NP': '100010111010011', 'NQ': '01001111011100', 'NR': '00101111111011', 'NS': '010111001', 'NT': '00110000', 'NU': '001011001000', 'NV': '101101101110', 'NW': '001100111111000', 'NX': '0110110100110100', 'NY': '1011010110', 'NZ': '101101101011111', 'N_': '100001', 'OA': '10001011000', 'OB': '10101010011', 'OC': '01000100001', 'OD': '0010100001', 'OE': '001000101010', 'OF': '1110110', 'OG': '001011111111', 'OH': '0100111101100', 'OI': '01011011000', 'OJ': '010110110011111', 'OK': '00000110111', 'OL': '011011101', 'OM': '10101111', 'ON': '0101100', 'OO': '010010100', 'OP': '0110100001', 'OQ': '0110000000100101', 'OR': '0111010', 'OS': '101101100', 'OT': '10010100', 'OU': '0100001', 'OV': '0101111001', 'OW': 
'11001111', 'OX': '11000111111000', 'OY': '001011111101', 'OZ': '011011010010010', 'O_': '0100000', 'PA': '110011001', 'PB': '010001000000011000', 'PC': '1101101011001111011', 'PD': '11011010110011110101', 'PE': '001010001', 'PF': '0000010101000100100', 'PG': '010110110011110111001', 'PH': '11011010111', 'PI': '1010001001', 'PJ': '1010101000000011001000', 'PK': '011011010010110000', 'PL': '0000111011', 'PM': '00110011110001101', 'PN': '110001111111010111', 'PO': '101101010', 'PP': '00000101000', 'PQ': '000110011100011001010000010000000', 'PR': '011001100', 'PS': '010111010000', 'PT': '01101101000', 'PU': '00110111101', 'PV': '00011001110001100101000001111111', 'PW': '01100000001001000', 'PX': '0001100111000110010101000', 'PY': '1101101011000', 'PZ': '000110011100011001011101', 'P_': '0010101111', 'QA': '0001100111000110010100111', 'QB': '00011001110001100101000001111110', 'QC': '00011001110001100101000001111101', 'QD': '00011001110001100101000001111100', 'QE': '00011001110001100101000001111011', 'QF': '00011001110001100101000001111010', 'QG': '00011001110001100101000001111001', 'QH': '00011001110001100101000001111000', 'QI': '00011001110001100101000001110111', 'QJ': '00011001110001100101000001110110', 'QK': '00011001110001100101000001110101', 'QL': '00011001110001100101000001110100', 'QM': '00011001110001100101000001110011', 'QN': '0010110010011010110100', 'QO': '00011001110001100101000001110010', 'QP': '00011001110001100101000001110001', 'QQ': '00011001110001100101000001110000', 'QR': '00011001110001100101000001101111', 'QS': '000110011100011001011100', 'QT': '00011001110001100101000001101110', 'QU': '1100001111', 'QV': '00011001110001100101000001101101', 'QW': '0001100111000110010100110', 'QX': '00011001110001100101000001101100', 'QY': '00011001110001100101000001101011', 'QZ': '00011001110001100101000001101010', 'Q_': '100010111010010001', 'RA': '11000010', 'RB': '0100111100100', 'RC': '01101000000', 'RD': '110011011', 'RE': '0001101', 'RF': '101010111100', 'RG': 
'10101010100', 'RH': '00000101010000', 'RI': '01101111', 'RJ': '000001010100010011', 'RK': '01011101001', 'RL': '01001111000', 'RM': '00001110010', 'RN': '0110100110', 'RO': '01101011', 'RP': '101010100001', 'RQ': '10101010000000111', 'RR': '1100110101', 'RS': '001001100', 'RT': '010011101', 'RU': '1010101011', 'RV': '001011001011', 'RW': '1010101111101', 'RX': '10110110101110100100', 'RY': '110001110', 'RZ': '01000110011000101101', 'R_': '111010', 'SA': '010110101', 'SB': '01001111001010', 'SC': '1100110100', 'SD': '11000111111011', 'SE': '00000100', 'SF': '00101100100111', 'SG': '011011010010101', 'SH': '000001111', 'SI': '001010010', 'SJ': '1011011010111010000', 'SK': '100010111011', 'SL': '10110110100', 'SM': '010001110111', 'SN': '0100010000011', 'SO': '11101110', 'SP': '0100011100', 'SQ': '01100000001000', 'SR': '01000100000001010', 'SS': '010101010', 'ST': '1100010', 'SU': '111001100', 'SV': '110001111111010110', 'SW': '000001010101', 'SX': '0001100111000110010100101', 'SY': '0001100111011', 'SZ': '01101101001001101110', 'S_': '000010', 'TA': '001011101', 'TB': '11011010110011111', 'TC': '010001100111', 'TD': '0110110100101100010', 'TE': '1111000', 'TF': '00110011110110', 'TG': '011011010010110011', 'TH': '11010', 'TI': '00110110', 'TJ': '00011001110001101101', 'TK': '0001100111000110110000', 'TL': '0111011100', 'TM': '00110011111101', 'TN': '00011001110011', 'TO': '1001000', 'TP': '01101101001011011', 'TQ': '011000000010010011', 'TR': '010101001', 'TS': '111110101', 'TT': '0010001011', 'TU': '0010100111', 'TV': '01011011001111011111', 'TW': '01001011100', 'TX': '0001100111000110010100100', 'TY': '0011010111', 'TZ': '0100111100101100', 'T_': '000101', 'UA': '01110111111', 'UB': '10101010001', 'UC': '0110000001', 'UD': '01100000000', 'UE': '1100111000', 'UF': '0010110010010', 'UG': '0101110101', 'UH': '0001100111010110111', 'UI': '01000110000', 'UJ': '00110011110001011101', 'UK': '00011001110010', 'UL': '001011000', 'UM': '00011001000', 'UN': '001001111', 
'UO': '00110011111001', 'UP': '0011011111', 'UQ': '1011011010111010011', 'UR': '01101100', 'US': '10111111', 'UT': '10010101', 'UU': '100010111010010011', 'UV': '1011011010111011', 'UW': '000110011100011011110', 'UX': '0100011001100011', 'UY': '110001111110011', 'UZ': '0001100111010111', 'U_': '011001101', 'VA': '01000111010', 'VB': '00011001110001100101000001101001', 'VC': '000110011100011001011011', 'VD': '00011001110001100101000001101000', 'VE': '1111100', 'VF': '00011001110001100101000001100111', 'VG': '00011001110001100101000001100110', 'VH': '00011001110001100101000001100101', 'VI': '0100010001', 'VJ': '00011001110001100101000001100100', 'VK': '00011001110001100101000001100011', 'VL': '0010110010011010100111', 'VM': '010110110011110111000', 'VN': '01000100000001000110111', 'VO': '001010110001', 'VP': '00011001110001100101000001100010', 'VQ': '00011001110001100101000001100001', 'VR': '0001100111010110100', 'VS': '0001100111000110011111', 'VT': '000110011100011001011010', 'VU': '0101101100111100', 'VV': '0001100111000110011110', 'VW': '00011001110001100101000001100000', 'VX': '00011001110001100101000001011111', 'VY': '10101010010101', 'VZ': '00011001110001100101000001011110', 'V_': '00001100001001', 'WA': '10000010', 'WB': '01101101001001100', 'WC': '010110110011110110111', 'WD': '10110110101000', 'WE': '11011001', 'WF': '011011010010100', 'WG': '01101101001011000111', 'WH': '10110111', 'WI': '10111110', 'WJ': '0001100111000110010100011', 'WK': '101010100000000', 'WL': '1110111100100', 'WM': '100010111010100010', 'WN': '1100111001', 'WO': '011010001', 'WP': '00110011110001011100', 'WQ': '00011001110001100101000001011101', 'WR': '010111010001', 'WS': '010110110010', 'WT': '10001011101010000', 'WU': '01001111001011011', 'WV': '00011001110001100101000001011100', 'WW': '011011010010110010', 'WX': '00011001110001100101000001011011', 'WY': '010011110111010', 'WZ': '00011001110001100110101', 'W_': '010011100', 'XA': '1011011011110', 'XB': '011011010010110101', 'XC': 
'0010101100000', 'XD': '00011001110001100101000001011010', 'XE': '0100010000010', 'XF': '100010111010100011', 'XG': '00011001110001100101000001011001', 'XH': '110001111110010', 'XI': '0100011001101', 'XJ': '00011001110001100101000001011000', 'XK': '00011001110001100101000001010111', 'XL': '10110110101110101', 'XM': '010110110011110110110', 'XN': '0001100111000110010100010', 'XO': '000110011100011000', 'XP': '100111011110', 'XQ': '000110011100011010', 'XR': '000110011100011001011001', 'XS': '000110011100011011001', 'XT': '100111011111', 'XU': '1000101110101011', 'XV': '1000101110100101', 'XW': '0010110010011010100110', 'XX': '101010100101101', 'XY': '10110110101001001', 'XZ': '00011001110001100101000001010110', 'X_': '0010111111001', 'YA': '1010101111111', 'YB': '1101101011010', 'YC': '1000101110101010', 'YD': '01000100000001001', 'YE': '1001110100', 'YF': '01101101001101010', 'YG': '11000111111101010', 'YH': '00110011110001100', 'YI': '011101111100', 'YJ': '0001100111000110011101', 'YK': '01000110011000101110', 'YL': '10001011101011', 'YM': '00110011110000', 'YN': '101101101010011', 'YO': '001011100', 'YP': '010011110010111', 'YQ': '1000101110100100001011', 'YR': '01000100000000', 'YS': '00000110110', 'YT': '0010111111100', 'YU': '1000101110100100000', 'YV': '11011010110011110100', 'YW': '11000111111010', 'YX': '0001100111000110011100', 'YY': '00011001110001100110100', 'YZ': '010001000000011001', 'Y_': '111000', 'ZA': '10110110101100', 'ZB': '0001100111000110111111', 'ZC': '01101101001001101101', 'ZD': '00011001110001100101000001010101', 'ZE': '101010100100', 'ZF': '00011001110001100101000001010100', 'ZG': '00011001110001100101000001010011', 'ZH': '0100011001100010111110', 'ZI': '000011100111001', 'ZJ': '0001100111000110010100001', 'ZK': '1000101110100100001010', 'ZL': '1000101110101001', 'ZM': '00011001110001100101000001010010', 'ZN': '01000100000001000110110', 'ZO': '101101101011100', 'ZP': '000110011100011001011000', 'ZQ': '00011001110001100101000001010001', 
'ZR': '0001100111000110011011', 'ZS': '00011001110001100101000001010000', 'ZT': '001011001001101010010', 'ZU': '010001100110001001', 'ZV': '010001000000010001110', 'ZW': '01101101001001101100', 'ZX': '00011001110001100101000001001111', 'ZY': '0100010000000111', 'ZZ': '101010111110001', 'Z_': '001100111111001', '_A': '000000', '_B': '0100110', '_C': '1000000', '_D': '1100101', '_E': '01100010', '_F': '1001001', '_G': '01100111', '_H': '110111', '_I': '100110', '_J': '1001110101', '_K': '0001100110', '_L': '00100011', '_M': '0100100', '_N': '00101010', '_O': '111101', '_P': '1111001', '_Q': '10110110110', '_R': '10001001', '_S': '100011', '_T': '01111', '_U': '010010110', '_V': '0010100000', '_W': '101100', '_X': '11000111111111', '_Y': '11011000', '_Z': '101010100101100', '__': '00011001110001100101000001001110'}\n"
     ]
    }
   ],
   "source": [
    "# Build the codeword table for the entries of data_dict_pairs with huffman(),\n",
    "# then show every codeword through its .bin string (presumably bitstring\n",
    "# objects, given the `bitstring` import -- confirm against huffman()).\n",
    "double_codewords = huffman(data_dict_pairs)\n",
    "print({pair: double_codewords[pair].bin for pair in double_codewords})\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Average length per character - independent pairs block code: 4.087857277979846\n",
      "Average length per character - paired Huffman code: 3.720488053270046\n"
     ]
    }
   ],
   "source": [
    "# Expected code length per character for the two pair codes: the weighted mean\n",
    "# codeword length (weights taken from data_dict_pairs), divided by 2 to go from\n",
    "# a per-pair figure to a per-character figure.\n",
    "# NOTE(review): both tables are indexed with pair[0] rather than pair -- this\n",
    "# presumes each key of data_dict_pairs is a sequence whose first element is the\n",
    "# table key; confirm against the cell that builds data_dict_pairs.\n",
    "pair_weight_total = sum(data_dict_pairs[pair] for pair in data_dict_pairs)\n",
    "\n",
    "indept_bits_per_char = sum(\n",
    "    data_dict_pairs[pair] * len(double_indept_codewords[pair[0]].bin)\n",
    "    for pair in data_dict_pairs\n",
    ") / pair_weight_total / 2\n",
    "print('Average length per character - independent pairs block code: '\n",
    "      + str(indept_bits_per_char))\n",
    "\n",
    "huffman_bits_per_char = sum(\n",
    "    data_dict_pairs[pair] * len(double_codewords[pair[0]].bin)\n",
    "    for pair in data_dict_pairs\n",
    ") / pair_weight_total / 2\n",
    "print('Average length per character - paired Huffman code: '\n",
    "      + str(huffman_bits_per_char))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.5"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
