{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "5b251563",
   "metadata": {
    "id": "5b251563"
   },
   "source": [
    "# BPE Tokenization using `tiktoken`\n",
    "\n",
    "**BPE** (Byte Pair Encoding) tokenization uses statistics computed over a large corpus of text to determine how strings should be segmented into *subword* tokens. Here we use the `tiktoken` tokenizer from OpenAI."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "0f986314",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "executionInfo": {
     "elapsed": 13,
     "status": "ok",
     "timestamp": 1756836696317,
     "user": {
      "displayName": "Razvan Bunescu",
      "userId": "08159777761660776328"
     },
     "user_tz": 240
    },
    "id": "0f986314",
    "outputId": "c0672edd-587d-4204-b442-468af1382729"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[708, 39721, 1790, 436, 637, 81, 4628, 304, 78311, 24751, 420, 19367, 0]\n"
     ]
    }
   ],
   "source": [
    "# Uncomment this line if tiktoken is not yet installed on your machine.\n",
    "#!pip install tiktoken\n",
    "\n",
    "import tiktoken\n",
    "\n",
    "# To get the tokenizer corresponding to a specific model in the OpenAI API:\n",
    "enc = tiktoken.encoding_for_model(\"gpt-4\")\n",
    "\n",
    "# The .encode() method converts a text string into a list of token integers.\n",
    "ltokens = enc.encode(\"soooo much rrrracing in Kannapolis this Summer!\")\n",
    "print(ltokens)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "3cadd5c7",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 35
    },
    "executionInfo": {
     "elapsed": 6,
     "status": "ok",
     "timestamp": 1756836770758,
     "user": {
      "displayName": "Razvan Bunescu",
      "userId": "08159777761660776328"
     },
     "user_tz": 240
    },
    "id": "3cadd5c7",
    "outputId": "f2059859-8bc0-4678-899c-b555f0aff1ed"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'soooo much rrrracing in Kannapolis this Summer!'"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# The .decode() method converts a list of token integers to a string.\n",
    "enc.decode(ltokens)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "ca0788cc",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "executionInfo": {
     "elapsed": 4,
     "status": "ok",
     "timestamp": 1756836795019,
     "user": {
      "displayName": "Razvan Bunescu",
      "userId": "08159777761660776328"
     },
     "user_tz": 240
    },
    "id": "ca0788cc",
    "outputId": "a57c6143-4fc8-4c31-fd75-4437e2c8550a"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[b'so', b'ooo', b' much', b' r', b'rr', b'r', b'acing', b' in', b' Kann', b'apolis', b' this', b' Summer', b'!']\n"
     ]
    }
   ],
   "source": [
    "# The .decode_single_token_bytes() method safely converts a single integer token to the bytes it represents.\n",
    "tokens = [enc.decode_single_token_bytes(token) for token in ltokens]\n",
    "print(tokens)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "0c53df40",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "executionInfo": {
     "elapsed": 23,
     "status": "ok",
     "timestamp": 1756836881529,
     "user": {
      "displayName": "Razvan Bunescu",
      "userId": "08159777761660776328"
     },
     "user_tz": 240
    },
    "id": "0c53df40",
    "outputId": "b9f3fbb7-7647-4d6b-ee33-3ef1dfed5990"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[b'so', b'ooo', b' much', b' r', b'rr', b'r', b'acing', b' in', b' Kann', b'apolis', b' this', b' Summer', b'!']\n"
     ]
    }
   ],
   "source": [
    "# We usually combine .encode() with .decode_single_token_bytes() into one list comprehension\n",
    "#    to get the list of tokens as byte strings.\n",
    "tokens = [enc.decode_single_token_bytes(token) for token in enc.encode(\"soooo much rrrracing in Kannapolis this Summer!\")]\n",
    "\n",
    "# Note the 'b' in front of each string, which means that the string you see is a sequence of bytes.\n",
    "print(tokens)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "lAomeqOCTYyJ",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "executionInfo": {
     "elapsed": 26,
     "status": "ok",
     "timestamp": 1756836890016,
     "user": {
      "displayName": "Razvan Bunescu",
      "userId": "08159777761660776328"
     },
     "user_tz": 240
    },
    "id": "lAomeqOCTYyJ",
    "outputId": "83df025a-790a-470d-8d61-47fc66c71ff1"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['so', 'ooo', ' much', ' r', 'rr', 'r', 'acing', ' in', ' Kann', 'apolis', ' this', ' Summer', '!']\n"
     ]
    }
   ],
   "source": [
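    "# The tokens are byte strings (bytes); to turn them into regular Python strings (str), decode them as UTF-8 with token.decode('utf-8').\n",
    "# Caveat: a token may hold only part of a multi-byte character, in which case decoding it on its own raises UnicodeDecodeError.\n",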
    "utf8_tokens = [token.decode('utf-8') for token in tokens]\n",
    "print(utf8_tokens)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2e543c7b",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "executionInfo": {
     "elapsed": 3,
     "status": "ok",
     "timestamp": 1756836920748,
     "user": {
      "displayName": "Razvan Bunescu",
      "userId": "08159777761660776328"
     },
     "user_tz": 240
    },
    "id": "2e543c7b",
    "outputId": "be125a5d-6fa7-4b47-bdd2-8eaac0c65daf"
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "96de35c3",
   "metadata": {
    "id": "96de35c3"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[b'I', b' think', b' what', b' she', b' said', b' is', b' so', b'ooo', b' cra', b'aa', b'azy', b'!']\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'think'"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokens = [enc.decode_single_token_bytes(token) for token in enc.encode(\"I think what she said is soooo craaaazy!\")]\n",
    "print(tokens)\n",
    "\n",
    "tokens[1].strip().decode('utf-8')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "c1ea0e31",
   "metadata": {
    "id": "c1ea0e31"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[b'The', b' perplex', b'ing', b' cat', b' sat', b' on', b' the', b' mat', b'.']\n",
      "['The', ' perplex', 'ing', ' cat', ' sat', ' on', ' the', ' mat', '.']\n"
     ]
    }
   ],
   "source": [
    "# Another example showing subword tokens.\n",
    "tokens = [enc.decode_single_token_bytes(token) for token in enc.encode(\"The perplexing cat sat on the mat.\")]\n",
    "print(tokens)\n",
    "\n",
    "utf8_tokens = [token.decode('utf-8') for token in tokens]\n",
    "print(utf8_tokens)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "164a1c9e-9d1a-4f58-9c1a-ab95d93d4800",
   "metadata": {},
   "source": [
    "## Tokenization of multiple lines of text\n",
    "\n",
    "Let's try `tiktoken` on the first stanza of [The Contract Drafting Em](https://secularsolstice.github.io/songs/Contract_Drafting_Em/gen/)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "6c174009",
   "metadata": {
    "id": "6c174009"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "I am a contract-drafting em,\n",
      "The loyalest of lawyers!\n",
      "I draw up terms for deals 'twixt firms\n",
      "To service my employers!\n",
      "I like drafting poems.\n",
      " \n",
      "\n",
      "['I', ' am', ' a', ' contract', '-d', 'raft', 'ing', ' em', ',\\n', 'The', ' lo', 'y', 'ale', 'st', ' of', ' lawyers', '!\\n', 'I', ' draw', ' up', ' terms', ' for', ' deals', \" '\", 'tw', 'ix', 't', ' firms', '\\n', 'To', ' service', ' my', ' employers', '!\\n', 'I', ' like', ' drafting', ' poems', '.\\n']\n"
     ]
    }
   ],
   "source": [
    "stanza = \"I am a contract-drafting em,\\n\" \\\n",
    "  \"The loyalest of lawyers!\\n\" \\\n",
    "  \"I draw up terms for deals 'twixt firms\\n\" \\\n",
    "  \"To service my employers!\\n\" \\\n",
    "  \"I like drafting poems.\\n\"\n",
    "print(stanza, '\\n')\n",
    "\n",
    "tokens = [enc.decode_single_token_bytes(token) for token in enc.encode(stanza)]\n",
    "utf8_tokens = [token.decode('utf-8') for token in tokens]\n",
    "print(utf8_tokens)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "917b00f3-9a9d-41b7-83ac-c4161ffa109b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[b'His', b' \\t', b' \\n \\n', b'\\t     ', b'\\t ', b' amb', b'ivalence', b' was', b'  ', b' perplex', b'ing', b'.']\n"
     ]
    }
   ],
   "source": [
    "# Let's see how tiktoken handles different kinds of whitespace (spaces, tabs, and newlines).\n",
    "tokens = [enc.decode_single_token_bytes(token) for token in enc.encode(\"His \\t \\n \\n\\t     \\t  ambivalence was   perplexing.\")]\n",
    "# Note: enc.decode_single_token_bytes() takes one integer token at a time, so it cannot be applied directly to the whole list returned by enc.encode(); hence the list comprehension above.\n",
    "\n",
    "print(tokens)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "11ff5975-d9eb-46b8-8bec-2dd05b00ac14",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "colab": {
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}