{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#import numpy as np\n",
    "from scipy.special import xlogy\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "from numpy import log2\n",
    "import bitstring as bt\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = pd.read_csv('single_counts.csv', index_col=0)['Count']\n",
    "data_dict = {(x,): data[x] for x in data.index}              \n",
    " #Efficiency suggests removing codewords with zero counts, but this may make some messages impossible to encode"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Implements the Huffman algorithm to construct an optimal code\n",
    "#Not the most efficient implementation (due to the memory management of dicts)\n",
    "\n",
    "def huffman(data_dict):\n",
    "    working_dict = data_dict.copy()  # dict of tree segments that have already been processed\n",
    "    codewords = {x[0]:bt.Bits(bin='') for x in data_dict.keys()} # dict of codewords for each input symbol\n",
    "\n",
    "    while(len(working_dict)>1):\n",
    "        lowest_ind= min(working_dict, key=working_dict.get)  # find entry with smallest counts\n",
    "        lowest_count = working_dict.pop(lowest_ind)          # remove entry from dict\n",
    "        second_ind = min(working_dict, key=working_dict.get) # find entry with (second) smallest counts\n",
    "        second_count = working_dict.pop(second_ind)          # remove entry from dict\n",
    "\n",
    "        merge_ind = lowest_ind + second_ind                  # build concatenated codeword\n",
    "        merge_count = lowest_count+ second_count             # compute concatented probability\n",
    "        working_dict[merge_ind] = merge_count                # insert new codeword into dict\n",
    "\n",
    "        # Add prefixes to the codewords corresponding to the newly combined tree segments\n",
    "        for x in lowest_ind:\n",
    "            codewords[x] = bt.Bits(bin='1')+codewords[x] #Notice we prefix - this gives an instantaneous code\n",
    "        for x in second_ind:\n",
    "            codewords[x] = bt.Bits(bin='0')+codewords[x]\n",
    "    return(codewords)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "codewords = huffman(data_dict)\n",
    "print({x:codewords[x].bin for x in codewords})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "average_length = sum([data[x]*len(codewords[x].bin) for x in codewords])/sum(data)\n",
    "average_length"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def encoder(message, codewords):\n",
    "    #Applies the Huffman code, assuming it is a fixed length blocks\n",
    "    #Pads with space if necessary\n",
    "    #Inefficient use of memory, but easy to follow\n",
    "    \n",
    "    output = bt.Bits(bin='')\n",
    "    block_len= len(list(codewords.keys())[0])\n",
    "    initial_message = message+\"\"\n",
    "    while(len(message)>=block_len):\n",
    "        block = message[:block_len]\n",
    "        output = output + codewords[block]\n",
    "        message = message[block_len:]\n",
    "    if message == '':\n",
    "        return (output, initial_message)\n",
    "    else:\n",
    "        return (output + codewords[message+'_'*(block_len-len(message))], initial_message+'_'*(block_len-len(message))) #returns both coded message and message, to see if padded\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "message = 'THE_RAIN_IN_SPAIN'\n",
    "coded_message, message = encoder(message, codewords)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "coded_message.bin"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#ASCII coding\n",
    "''.join(format(ord(x), 'b') for x in message)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def decoder(coded_message, codewords):\n",
    "    decode_dict = {codewords[x]:x for x in codewords}\n",
    "    output = \"\"\n",
    "    max_codeword_len = max(len(codewords[x]) for x in codewords)\n",
    "    while(len(coded_message)>0):\n",
    "        code_len = 1\n",
    "        while(code_len>0):\n",
    "            assert(code_len <= max_codeword_len) #Throw an error if code_len is too long...\n",
    "            test_word= coded_message[:code_len]\n",
    "            if test_word in decode_dict:\n",
    "                output = output+decode_dict[test_word]\n",
    "                coded_message = coded_message[code_len:]\n",
    "                code_len = 0\n",
    "            else:\n",
    "                code_len = code_len+1\n",
    "\n",
    "    return(output)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "decoder(coded_message, codewords)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "error_location = 1\n",
    "error_message = list(coded_message.bin)\n",
    "error_message[error_location] = str((int(error_message[error_location])+1)%2)\n",
    "error_message = bt.Bits(bin=''.join(error_message))\n",
    "decoder(error_message, codewords)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.5"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}