{ "cells": [ { "cell_type": "markdown", "id": "5b251563", "metadata": { "id": "5b251563" }, "source": [ "# BPE Tokenization using `tiktoken`\n", "\n", "**BPE** (Byte Pair Encoding) tokenization uses statistics over a large corpus of text to determine how strings should be segmented into *subword* tokens. Here we use the `tiktoken` tokenizer from OpenAI." ] }, { "cell_type": "code", "execution_count": 1, "id": "0f986314", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "executionInfo": { "elapsed": 13, "status": "ok", "timestamp": 1756836696317, "user": { "displayName": "Razvan Bunescu", "userId": "08159777761660776328" }, "user_tz": 240 }, "id": "0f986314", "outputId": "c0672edd-587d-4204-b442-468af1382729" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[708, 39721, 1790, 436, 637, 81, 4628, 304, 78311, 24751, 420, 19367, 0]\n" ] } ], "source": [ "# Uncomment this line if tiktoken is not yet installed on your machine.\n", "#!pip install tiktoken\n", "\n", "import tiktoken\n", "\n", "# To get the tokenizer corresponding to a specific model in the OpenAI API:\n", "enc = tiktoken.encoding_for_model(\"gpt-4\")\n", "\n", "# The .encode() method converts a text string into a list of token integers.\n", "ltokens = enc.encode(\"soooo much rrrracing in Kannapolis this Summer!\")\n", "print(ltokens)" ] }, { "cell_type": "code", "execution_count": 2, "id": "3cadd5c7", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 35 }, "executionInfo": { "elapsed": 6, "status": "ok", "timestamp": 1756836770758, "user": { "displayName": "Razvan Bunescu", "userId": "08159777761660776328" }, "user_tz": 240 }, "id": "3cadd5c7", "outputId": "f2059859-8bc0-4678-899c-b555f0aff1ed" }, "outputs": [ { "data": { "text/plain": [ "'soooo much rrrracing in Kannapolis this Summer!'" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# The .decode() method converts a list of token integers to a 
string.\n", "enc.decode(ltokens)" ] }, { "cell_type": "code", "execution_count": 3, "id": "ca0788cc", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "executionInfo": { "elapsed": 4, "status": "ok", "timestamp": 1756836795019, "user": { "displayName": "Razvan Bunescu", "userId": "08159777761660776328" }, "user_tz": 240 }, "id": "ca0788cc", "outputId": "a57c6143-4fc8-4c31-fd75-4437e2c8550a" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[b'so', b'ooo', b' much', b' r', b'rr', b'r', b'acing', b' in', b' Kann', b'apolis', b' this', b' Summer', b'!']\n" ] } ], "source": [ "# The .decode_single_token_bytes() method safely converts a single integer token to the bytes it represents.\n", "tokens = [enc.decode_single_token_bytes(token) for token in ltokens]\n", "print(tokens)" ] }, { "cell_type": "code", "execution_count": 4, "id": "0c53df40", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "executionInfo": { "elapsed": 23, "status": "ok", "timestamp": 1756836881529, "user": { "displayName": "Razvan Bunescu", "userId": "08159777761660776328" }, "user_tz": 240 }, "id": "0c53df40", "outputId": "b9f3fbb7-7647-4d6b-ee33-3ef1dfed5990" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[b'so', b'ooo', b' much', b' r', b'rr', b'r', b'acing', b' in', b' Kann', b'apolis', b' this', b' Summer', b'!']\n" ] } ], "source": [ "# We usually combine .encode() with .decode_single_token_bytes() into one list comprehension\n", "# to get the list of tokens as byte strings.\n", "tokens = [enc.decode_single_token_bytes(token) for token in enc.encode(\"soooo much rrrracing in Kannapolis this Summer!\")]\n", "\n", "# Note the 'b' in front of each string, which means that the string you see is a sequence of bytes.\n", "print(tokens)" ] }, { "cell_type": "code", "execution_count": 5, "id": "lAomeqOCTYyJ", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "executionInfo": { "elapsed": 26, "status": "ok", 
"timestamp": 1756836890016, "user": { "displayName": "Razvan Bunescu", "userId": "08159777761660776328" }, "user_tz": 240 }, "id": "lAomeqOCTYyJ", "outputId": "83df025a-790a-470d-8d61-47fc66c71ff1" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['so', 'ooo', ' much', ' r', 'rr', 'r', 'acing', ' in', ' Kann', 'apolis', ' this', ' Summer', '!']\n" ] } ], "source": [ "# To turn each byte string into a regular Python string (str), decode it as UTF-8 with token.decode('utf-8').\n", "utf8_tokens = [token.decode('utf-8') for token in tokens]\n", "print(utf8_tokens)" ] }, { "cell_type": "code", "execution_count": null, "id": "2e543c7b", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "executionInfo": { "elapsed": 3, "status": "ok", "timestamp": 1756836920748, "user": { "displayName": "Razvan Bunescu", "userId": "08159777761660776328" }, "user_tz": 240 }, "id": "2e543c7b", "outputId": "be125a5d-6fa7-4b47-bdd2-8eaac0c65daf" }, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 6, "id": "96de35c3", "metadata": { "id": "96de35c3" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[b'I', b' think', b' what', b' she', b' said', b' is', b' so', b'ooo', b' cra', b'aa', b'azy', b'!']\n" ] }, { "data": { "text/plain": [ "'think'" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokens = [enc.decode_single_token_bytes(token) for token in enc.encode(\"I think what she said is soooo craaaazy!\")]\n", "print(tokens)\n", "\n", "tokens[1].strip().decode('utf-8')" ] }, { "cell_type": "code", "execution_count": 7, "id": "c1ea0e31", "metadata": { "id": "c1ea0e31" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[b'The', b' perplex', b'ing', b' cat', b' sat', b' on', b' the', b' mat', b'.']\n", "['The', ' perplex', 'ing', ' cat', ' sat', ' on', ' the', ' mat', '.']\n" ] } ], "source": [ "# Another example showing subword tokens.\n", "tokens = 
[enc.decode_single_token_bytes(token) for token in enc.encode(\"The perplexing cat sat on the mat.\")]\n", "print(tokens)\n", "\n", "utf8_tokens = [token.decode('utf-8') for token in tokens]\n", "print(utf8_tokens)" ] }, { "cell_type": "markdown", "id": "164a1c9e-9d1a-4f58-9c1a-ab95d93d4800", "metadata": {}, "source": [ "## Tokenization of multiple lines of text\n", "\n", "Let's try `tiktoken` on the first stanza of [The Contract Drafting Em](https://secularsolstice.github.io/songs/Contract_Drafting_Em/gen/)." ] }, { "cell_type": "code", "execution_count": 13, "id": "6c174009", "metadata": { "id": "6c174009" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "I am a contract-drafting em,\n", "The loyalest of lawyers!\n", "I draw up terms for deals 'twixt firms\n", "To service my employers!\n", "I like drafting poems.\n", " \n", "\n", "['I', ' am', ' a', ' contract', '-d', 'raft', 'ing', ' em', ',\\n', 'The', ' lo', 'y', 'ale', 'st', ' of', ' lawyers', '!\\n', 'I', ' draw', ' up', ' terms', ' for', ' deals', \" '\", 'tw', 'ix', 't', ' firms', '\\n', 'To', ' service', ' my', ' employers', '!\\n', 'I', ' like', ' drafting', ' poems', '.\\n']\n" ] } ], "source": [ "stanza = \"I am a contract-drafting em,\\n\" \\\n", " \"The loyalest of lawyers!\\n\" \\\n", " \"I draw up terms for deals 'twixt firms\\n\" \\\n", " \"To service my employers!\\n\" \\\n", " \"I like drafting poems.\\n\"\n", "print(stanza, '\\n')\n", "\n", "tokens = [enc.decode_single_token_bytes(token) for token in enc.encode(stanza)]\n", "utf8_tokens = [token.decode('utf-8') for token in tokens]\n", "print(utf8_tokens)" ] }, { "cell_type": "code", "execution_count": 12, "id": "917b00f3-9a9d-41b7-83ac-c4161ffa109b", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[b'His', b' \\t', b' \\n \\n', b'\\t ', b'\\t ', b' amb', b'ivalence', b' was', b' ', b' perplex', b'ing', b'.']\n" ] } ], "source": [ "# Let's see how tiktoken deals with different types of 
white space.\n", "tokens = [enc.decode_single_token_bytes(token) for token in enc.encode(\"His \\t \\n \\n\\t \\t ambivalence was perplexing.\")]\n", "#tokens = enc.decode_single_token_bytes(enc.encode(\"His ambivalence was perplexing.\"))\n", "\n", "print(tokens)" ] }, { "cell_type": "code", "execution_count": null, "id": "11ff5975-d9eb-46b8-8bec-2dd05b00ac14", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "colab": { "provenance": [] }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.4" } }, "nbformat": 4, "nbformat_minor": 5 }