diff --git a/04-text-byte/.gitignore b/04-text-byte/.gitignore index 421376d..b9ad381 100644 --- a/04-text-byte/.gitignore +++ b/04-text-byte/.gitignore @@ -1 +1,2 @@ dummy +cafe.txt diff --git a/04-text-byte/04-text-byte.ipynb b/04-text-byte/04-text-byte.ipynb new file mode 100644 index 0000000..eb566fc --- /dev/null +++ b/04-text-byte/04-text-byte.ipynb @@ -0,0 +1,2238 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "9bbc5481", + "metadata": {}, + "source": [ + "# Chapter 4 — Unicode Text Versus Bytes" + ] + }, + { + "cell_type": "markdown", + "id": "ca979304", + "metadata": {}, + "source": [ + "## Character Issues" + ] + }, + { + "cell_type": "markdown", + "id": "dcbc6239", + "metadata": {}, + "source": [ + "#### Example 4-1. Encoding and decoding" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "6a3b3a4f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "4" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "s = 'café'\n", + "len(s)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "53523bce", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "b'caf\\xc3\\xa9'" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "b = s.encode('utf8')\n", + "b" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "6ba8b10d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "5" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(b)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "92f1f056", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'café'" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "b.decode('utf8')" + ] + }, + { + "cell_type": "markdown", + "id": "f9a606ee", + 
"metadata": {}, + "source": [ + "## Byte Essentials" + ] + }, + { + "cell_type": "markdown", + "id": "7c1ec7f1", + "metadata": {}, + "source": [ + "#### Example 4-2. A five-byte sequence as `bytes` and as `bytearray`" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "3992eecc", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "b'caf\\xc3\\xa9'" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cafe = bytes('café', encoding='utf_8')\n", + "cafe" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "c9dbce26", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "99" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cafe[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "cbe46c5b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "b'c'" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cafe[:1]" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "d29feb33", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "bytearray(b'caf\\xc3\\xa9')" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cafe_arr = bytearray(cafe)\n", + "cafe_arr" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "dd6fb270", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "bytearray(b'\\xa9')" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cafe_arr[-1:]" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "cf5f74b1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "b'1K\\xce\\xa9'" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + 
"source": [ + "bytes.fromhex('31 4B CE A9')" + ] + }, + { + "cell_type": "markdown", + "id": "d9ff5124", + "metadata": {}, + "source": [ + "#### Example 4-3. Initializing bytes from the raw data of an array" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "fe69c5fd", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "b'\\xfe\\xff\\xff\\xff\\x00\\x00\\x01\\x00\\x02\\x00'" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import array\n", + "numbers = array.array('h', [-2, -1, 0, 1, 2])\n", + "octets = bytes(numbers)\n", + "octets" + ] + }, + { + "cell_type": "markdown", + "id": "7bf1245c", + "metadata": {}, + "source": [ + "### Basic Encoders/Decoders" + ] + }, + { + "cell_type": "markdown", + "id": "ff4ae5c5", + "metadata": {}, + "source": [ + "#### Example 4-4. The string “El Niño” encoded with three codecs producing very different byte sequences" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "60c8b066", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "latin_1\tb'El Ni\\xf1o'\n", + "utf_8\tb'El Ni\\xc3\\xb1o'\n", + "utf_16\tb'\\xff\\xfeE\\x00l\\x00 \\x00N\\x00i\\x00\\xf1\\x00o\\x00'\n" + ] + } + ], + "source": [ + "for codec in ['latin_1', 'utf_8', 'utf_16']:\n", + " print(codec, 'El Niño'.encode(codec), sep='\\t')" + ] + }, + { + "cell_type": "markdown", + "id": "0204104d", + "metadata": {}, + "source": [ + "## Understanding Encode/Decode Problems" + ] + }, + { + "cell_type": "markdown", + "id": "0547bdf0", + "metadata": {}, + "source": [ + "### Coping with UnicodeEncodeError" + ] + }, + { + "cell_type": "markdown", + "id": "b9edceb8", + "metadata": {}, + "source": [ + "#### Example 4-5.
Encoding to bytes: success and error handling" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "7aa1d383", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "b'S\\xc3\\xa3o Paulo'" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "city = 'São Paulo'\n", + "city.encode('utf_8')" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "69e2ea38", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "b'\\xff\\xfeS\\x00\\xe3\\x00o\\x00 \\x00P\\x00a\\x00u\\x00l\\x00o\\x00'" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "city.encode('utf_16')" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "8cf10d99", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "b'S\\xe3o Paulo'" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "city.encode('iso8859_1')" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "0a0becf5", + "metadata": { + "tags": [ + "raises-exception" + ] + }, + "outputs": [ + { + "ename": "UnicodeEncodeError", + "evalue": "'charmap' codec can't encode character '\\xe3' in position 1: character maps to <undefined>", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mUnicodeEncodeError\u001b[0m Traceback (most recent call last)", + "Input \u001b[0;32mIn [16]\u001b[0m, in \u001b[0;36m<cell line: 1>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mcity\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mencode\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mcp437\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/anaconda3/envs/fluent-python/lib/python3.10/encodings/cp437.py:12\u001b[0m, in
\u001b[0;36mCodec.encode\u001b[0;34m(self, input, errors)\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mencode\u001b[39m(\u001b[38;5;28mself\u001b[39m,\u001b[38;5;28minput\u001b[39m,errors\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mstrict\u001b[39m\u001b[38;5;124m'\u001b[39m):\n\u001b[0;32m---> 12\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mcodecs\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcharmap_encode\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43merrors\u001b[49m\u001b[43m,\u001b[49m\u001b[43mencoding_map\u001b[49m\u001b[43m)\u001b[49m\n", + "\u001b[0;31mUnicodeEncodeError\u001b[0m: 'charmap' codec can't encode character '\\xe3' in position 1: character maps to <undefined>" + ] + } + ], + "source": [ + "city.encode('cp437')" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "fe6eae98", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "b'So Paulo'" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "city.encode('cp437', errors='ignore')" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "11e913e7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "b'S?o Paulo'" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "city.encode('cp437', errors='replace')" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "c6e189b3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "b'S&#227;o Paulo'" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "city.encode('cp437', errors='xmlcharrefreplace')" + ] + }, + { + "cell_type": "markdown", + "id": "cd350b72", + "metadata": {}, + "source": [ + "### Coping with UnicodeDecodeError" + ] + }, + { + "cell_type": "markdown", +
"id": "cabd0d66", + "metadata": {}, + "source": [ + "#### Example 4-6. Decoding from bytes to str: success and error handling" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "7f1faf3c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'Montréal'" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "octets = b'Montr\\xe9al'\n", + "octets.decode('cp1252')" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "8fd19098", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'Montrιal'" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "octets.decode('iso8859_7')" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "a3d75506", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'MontrИal'" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "octets.decode('koi8_r')" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "a245459d", + "metadata": { + "tags": [ + "raises-exception" + ] + }, + "outputs": [ + { + "ename": "UnicodeDecodeError", + "evalue": "'utf-8' codec can't decode byte 0xe9 in position 5: invalid continuation byte", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mUnicodeDecodeError\u001b[0m Traceback (most recent call last)", + "Input \u001b[0;32mIn [23]\u001b[0m, in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43moctets\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdecode\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mutf_8\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m\n", + "\u001b[0;31mUnicodeDecodeError\u001b[0m: 'utf-8' codec can't decode byte 0xe9 in
position 5: invalid continuation byte" + ] + } + ], + "source": [ + "octets.decode('utf_8')" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "52bcaa93", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'Montr�al'" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "octets.decode('utf_8', errors='replace')" + ] + }, + { + "cell_type": "markdown", + "id": "601a4482", + "metadata": {}, + "source": [ + "### SyntaxError when Loading Modules with Unexpected Encoding" + ] + }, + { + "cell_type": "markdown", + "id": "b5de81cc", + "metadata": {}, + "source": [ + "#### Example 4-7. [ola.py](ola.py): “Hello, World!” in Portuguese" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "7edc2d56", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Olá, Mundo!\r\n" + ] + } + ], + "source": [ + "!python3 ola.py" + ] + }, + { + "cell_type": "markdown", + "id": "a38ba9db", + "metadata": {}, + "source": [ + "### BOM: A Useful Gremlin" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "ce7d9a63", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "b'\\xff\\xfeE\\x00l\\x00 \\x00N\\x00i\\x00\\xf1\\x00o\\x00'" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "u16 = 'El Niño'.encode('utf_16')\n", + "u16" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "1969bff8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[255, 254, 69, 0, 108, 0, 32, 0, 78, 0, 105, 0, 241, 0, 111, 0]" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "list(u16)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "6915d186", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[69, 0, 108, 0, 32, 0, 78, 0, 105, 0, 241, 
0, 111, 0]" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "u16le = 'El Niño'.encode('utf_16le')\n", + "list(u16le)" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "ab36987f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[0, 69, 0, 108, 0, 32, 0, 78, 0, 105, 0, 241, 0, 111]" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "u16be = 'El Niño'.encode('utf_16be')\n", + "list(u16be)" + ] + }, + { + "cell_type": "markdown", + "id": "c988acf3", + "metadata": {}, + "source": [ + "## Handling Text Files" + ] + }, + { + "cell_type": "markdown", + "id": "f75ef2a6", + "metadata": {}, + "source": [ + "#### Example 4-8. A platform encoding issue" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "8b3e254a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "4" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "open('cafe.txt', 'w', encoding='utf_8').write('café')" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "2060dc22", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'cafÃ©'" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Note: We are forcing the bug by assigning the encoding 'cp1252' in this example\n", + "open('cafe.txt', encoding='cp1252').read()" + ] + }, + { + "cell_type": "markdown", + "id": "8854f3ae", + "metadata": {}, + "source": [ + "#### Example 4-9.
Closer inspection of Example 4-8 running on Windows reveals the bug and how to fix it" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "01032ff2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "<_io.TextIOWrapper name='cafe.txt' mode='w' encoding='utf_8'>" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fp = open('cafe.txt', 'w', encoding='utf_8')\n", + "fp" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "4e09658b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "4" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fp.write('café')" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "df0be9ad", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "5" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fp.close()\n", + "import os\n", + "os.stat('cafe.txt').st_size" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "3cbb9923", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "<_io.TextIOWrapper name='cafe.txt' mode='r' encoding='cp1252'>" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Note: We are forcing the issue by assigning the encoding 'cp1252' in this example\n", + "fp2 = open('cafe.txt', encoding='cp1252')\n", + "fp2" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "6a73e37f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'cp1252'" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fp2.encoding" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "40648ac6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + 
"<_io.TextIOWrapper name='cafe.txt' mode='r' encoding='utf_8'>" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fp3 = open('cafe.txt', encoding='utf_8')\n", + "fp3" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "987b598d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'café'" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fp3.read()" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "68ebf58c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "<_io.BufferedReader name='cafe.txt'>" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fp4 = open('cafe.txt', 'rb')\n", + "fp4" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "e16c94e0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "b'caf\\xc3\\xa9'" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fp4.read()" + ] + }, + { + "cell_type": "markdown", + "id": "be0f460d", + "metadata": {}, + "source": [ + "### Beware of Encoding Defaults" + ] + }, + { + "cell_type": "markdown", + "id": "b73e7809", + "metadata": {}, + "source": [ + "#### Example 4-10. 
Exploring encoding defaults: [default_encodings.py](default_encodings.py)" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "e2707f00", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " locale.getpreferredencoding() -> 'UTF-8'\r\n", + " type(my_file) -> <class '_io.TextIOWrapper'>\r\n", + " my_file.encoding -> 'UTF-8'\r\n", + " sys.stdout.isatty() -> True\r\n", + " sys.stdout.encoding -> 'utf-8'\r\n", + " sys.stdin.isatty() -> True\r\n", + " sys.stdin.encoding -> 'utf-8'\r\n", + " sys.stderr.isatty() -> True\r\n", + " sys.stderr.encoding -> 'utf-8'\r\n", + " sys.getdefaultencoding() -> 'utf-8'\r\n", + " sys.getfilesystemencoding() -> 'utf-8'\r\n" + ] + } + ], + "source": [ + "!python default_encodings.py # Unix machine output" + ] + }, + { + "cell_type": "markdown", + "id": "51a80f0f", + "metadata": {}, + "source": [ + "#### Example 4-12. [stdout_check.py](stdout_check.py)" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "4a8166e7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "3.10.4 (main, Mar 31 2022, 08:41:55) [GCC 7.5.0]\r\n", + "\r\n", + "sys.stdout.isatty(): True\r\n", + "sys.stdout.encoding: utf-8\r\n", + "\r\n", + "Trying to output HORIZONTAL ELLIPSIS:\r\n", + "…\r\n", + "Trying to output INFINITY:\r\n", + "∞\r\n", + "Trying to output CIRCLED NUMBER FORTY TWO:\r\n", + "㊷\r\n" + ] + } + ], + "source": [ + "!python stdout_check.py # Unix machine output" + ] + }, + { + "cell_type": "markdown", + "id": "60bada3a", + "metadata": {}, + "source": [ + "## Normalizing Unicode for Reliable Comparisons" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "09177041", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "('café', 'café')" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "s1 = 'café'\n", + "s2 = 'cafe\\N{COMBINING ACUTE ACCENT}'\n", + "s1, s2" +
] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "c4140d32", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(4, 5)" + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(s1), len(s2)" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "2a8c687f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "False" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "s1 == s2" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "id": "6dd29d8d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(4, 5)" + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from unicodedata import normalize\n", + "s1 = 'café'\n", + "s2 = 'cafe\\N{COMBINING ACUTE ACCENT}'\n", + "len(s1), len(s2)" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "id": "2abcf125", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(4, 4)" + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(normalize('NFC', s1)), len(normalize('NFC', s2))" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "id": "be5825d7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(5, 5)" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(normalize('NFD', s1)), len(normalize('NFD', s2))" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "dafcd872", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "normalize('NFC', s1) == normalize('NFC', s2)" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "id": 
"f54dd326", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 50, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "normalize('NFD', s1) == normalize('NFD', s2)" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "id": "2cbde3b5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'OHM SIGN'" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from unicodedata import name\n", + "ohm = '\\u2126'\n", + "name(ohm)" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "id": "76f3b515", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'GREEK CAPITAL LETTER OMEGA'" + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ohm_c = normalize('NFC', ohm)\n", + "name(ohm_c)" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "id": "fb2f0252", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "False" + ] + }, + "execution_count": 53, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ohm == ohm_c" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "id": "d047eba9", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 54, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "normalize('NFC', ohm) == normalize('NFC', ohm_c)" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "id": "0196edb7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "½\n" + ] + } + ], + "source": [ + "half = '\\N{VULGAR FRACTION ONE HALF}'\n", + "print(half)" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "id": "086f76a9", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'1⁄2'" + ] + }, + "execution_count": 
56, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "normalize('NFKC', half)" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "id": "bf79ba8e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1\tDIGIT ONE\n", + "⁄\tFRACTION SLASH\n", + "2\tDIGIT TWO\n" + ] + } + ], + "source": [ + "for char in normalize('NFKC', half):\n", + " print(char, name(char), sep='\\t')" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "id": "60e5a57a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'42'" + ] + }, + "execution_count": 58, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "four_squared = '4²'\n", + "normalize('NFKC', four_squared)" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "id": "72ce42f5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "('µ', 'μ')" + ] + }, + "execution_count": 59, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "micro = 'µ'\n", + "micro_kc = normalize('NFKC', micro)\n", + "micro, micro_kc" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "id": "c5d23eb6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(181, 956)" + ] + }, + "execution_count": 60, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ord(micro), ord(micro_kc)" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "id": "63e672c9", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "('MICRO SIGN', 'GREEK SMALL LETTER MU')" + ] + }, + "execution_count": 61, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "name(micro), name(micro_kc)" + ] + }, + { + "cell_type": "markdown", + "id": "18f6e399", + "metadata": {}, + "source": [ + "### Case Folding" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "id": "99edfd13", + "metadata": {}, + 
"outputs": [ + { + "data": { + "text/plain": [ + "'MICRO SIGN'" + ] + }, + "execution_count": 62, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "micro = 'µ'\n", + "name(micro)" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "id": "9f8c6938", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'GREEK SMALL LETTER MU'" + ] + }, + "execution_count": 63, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "micro_cf = micro.casefold()\n", + "name(micro_cf)" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "id": "a1eb1706", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "('µ', 'μ')" + ] + }, + "execution_count": 64, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "micro, micro_cf" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "id": "175f9b49", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'LATIN SMALL LETTER SHARP S'" + ] + }, + "execution_count": 65, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "eszett = 'ß'\n", + "name(eszett)" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "id": "8e80d6c2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "('ß', 'ss')" + ] + }, + "execution_count": 66, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "eszett_cf = eszett.casefold()\n", + "eszett, eszett_cf" + ] + }, + { + "cell_type": "markdown", + "id": "43a3a7d9", + "metadata": {}, + "source": [ + "### Utility Functions for Normalized Text Matching" + ] + }, + { + "cell_type": "markdown", + "id": "6d1e8f67", + "metadata": {}, + "source": [ + "#### Example 4-13. 
[normeq.py](normeq.py): normalized Unicode string comparison" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "id": "ca8c7d47", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "False" + ] + }, + "execution_count": 67, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from normeq import nfc_equal\n", + "s1 = 'café'\n", + "s2 = 'cafe\\u0301'\n", + "s1 == s2" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "id": "e7f35dfc", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 68, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "nfc_equal(s1, s2)" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "id": "69e21e21", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "False" + ] + }, + "execution_count": 69, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "nfc_equal('A', 'a')" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "id": "0ee92f88", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "False" + ] + }, + "execution_count": 70, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from normeq import fold_equal\n", + "s3 = 'Straße'\n", + "s4 = 'strasse'\n", + "s3 == s4" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "id": "f0477359", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "False" + ] + }, + "execution_count": 71, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "nfc_equal(s3, s4)" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "id": "395d4b33", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 72, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fold_equal(s3, s4)" + ] + }, + { + "cell_type": "code", + "execution_count": 
73, + "id": "c66f0238", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 73, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fold_equal(s1, s2)" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "id": "14157966", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 74, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fold_equal('A', 'a')" + ] + }, + { + "cell_type": "markdown", + "id": "af41b1ad", + "metadata": {}, + "source": [ + "### Extreme “Normalization”: Taking Out Diacritics" + ] + }, + { + "cell_type": "markdown", + "id": "0c8c2c61", + "metadata": {}, + "source": [ + "#### Example 4-14. [simplify.py](simplify.py): function to remove all combining marks" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "id": "0506ed43", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'“Herr Voß: • ½ cup of Œtker™ caffe latte • bowl of acai.”'" + ] + }, + "execution_count": 75, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from simplify import shave_marks\n", + "order = '“Herr Voß: • ½ cup of Œtker™ caffè latte • bowl of açaí.”'\n", + "shave_marks(order)" + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "id": "b074ef2e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'Ζεφυρος, Zefiro'" + ] + }, + "execution_count": 76, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "Greek = 'Ζέφυρος, Zéfiro'\n", + "shave_marks(Greek)" + ] + }, + { + "cell_type": "markdown", + "id": "afe70e19", + "metadata": {}, + "source": [ + "#### Example 4-16. 
Function to remove combining marks from Latin characters" + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "id": "1b9c02a5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'“Herr Voß: • ½ cup of Œtker™ caffe latte • bowl of acai.”'" + ] + }, + "execution_count": 77, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from simplify import shave_marks_latin\n", + "shave_marks_latin(order)" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "id": "a00b8761", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'Ζέφυρος, Zefiro'" + ] + }, + "execution_count": 78, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "shave_marks_latin(Greek)" + ] + }, + { + "cell_type": "markdown", + "id": "5300f138", + "metadata": {}, + "source": [ + "#### Example 4-17. Transform some Western typographical symbols into ASCII" + ] + }, + { + "cell_type": "code", + "execution_count": 79, + "id": "d230e514", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'\"Herr Voß: - ½ cup of OEtker(TM) caffè latte - bowl of açaí.\"'" + ] + }, + "execution_count": 79, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from simplify import dewinize, asciize\n", + "dewinize(order)" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "id": "40c30bd2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'\"Herr Voss: - 1⁄2 cup of OEtker(TM) caffe latte - bowl of acai.\"'" + ] + }, + "execution_count": 80, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "asciize(order)" + ] + }, + { + "cell_type": "markdown", + "id": "55173114", + "metadata": {}, + "source": [ + "## Sorting Unicode Text" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "id": "28d9bdff", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['acerola', 'atemoia', 'açaí', 'caju', 'cajá']" + ] + 
}, + "execution_count": 81, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fruits = ['caju', 'atemoia', 'cajá', 'açaí', 'acerola']\n", + "sorted(fruits)\n", + "# Ideal result should be: ['açaí', 'acerola', 'atemoia', 'cajá', 'caju']" + ] + }, + { + "cell_type": "markdown", + "id": "e83c363f", + "metadata": {}, + "source": [ + "\n", + "#### Example 4-19. [locale_sort.py](locale_sort.py): using the `locale.strxfrm` function as the sort key\n", + "\n", + "**Note**: Requires the `pt_BR.UTF-8` locale to be installed." + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "id": "c5bb098a", + "metadata": {}, + "outputs": [], + "source": [ + "# !python locale_sort.py" + ] + }, + { + "cell_type": "markdown", + "id": "6feb4d63", + "metadata": {}, + "source": [ + "### Sorting with the Unicode Collation Algorithm" + ] + }, + { + "cell_type": "markdown", + "id": "c6c3d053", + "metadata": {}, + "source": [ + "#### Example 4-20. Using the pyuca.Collator.sort_key method" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "id": "347d71a5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['açaí', 'acerola', 'atemoia', 'cajá', 'caju']" + ] + }, + "execution_count": 83, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pyuca\n", + "coll = pyuca.Collator()\n", + "sorted_fruits = sorted(fruits, key=coll.sort_key)\n", + "sorted_fruits" + ] + }, + { + "cell_type": "markdown", + "id": "df4396ea", + "metadata": {}, + "source": [ + "## The Unicode Database" + ] + }, + { + "cell_type": "markdown", + "id": "38e8db95", + "metadata": {}, + "source": [ + "### Finding Characters by Name" + ] + }, + { + "cell_type": "markdown", + "id": "6d6278c3", + "metadata": {}, + "source": [ + "#### Example 4-21.
[cf.py](charfinder/cf.py): the character finder utility" + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "id": "9d3ff51b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "U+1F638\t😸\tGRINNING CAT FACE WITH SMILING EYES\r\n", + "U+1F63A\t😺\tSMILING CAT FACE WITH OPEN MOUTH\r\n", + "U+1F63B\t😻\tSMILING CAT FACE WITH HEART-SHAPED EYES\r\n" + ] + } + ], + "source": [ + "!python3 charfinder/cf.py smiling cat" + ] + }, + { + "cell_type": "markdown", + "id": "f7913801", + "metadata": {}, + "source": [ + "### Numeric Meaning of Characters" + ] + }, + { + "cell_type": "markdown", + "id": "fe0cc8e5", + "metadata": {}, + "source": [ + "#### Example 4-22. [numerics_demo.py](numerics_demo.py): Demo of Unicode database numerical character metadata (callouts describe each column in the output)" + ] + }, + { + "cell_type": "code", + "execution_count": 85, + "id": "86ab5e1e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "U+0031\t 1 \tre_dig\tisdig\tisnum\t 1.00\tDIGIT ONE\r\n", + "U+00bc\t ¼ \t-\t-\tisnum\t 0.25\tVULGAR FRACTION ONE QUARTER\r\n", + "U+00b2\t ² \t-\tisdig\tisnum\t 2.00\tSUPERSCRIPT TWO\r\n", + "U+0969\t ३ \tre_dig\tisdig\tisnum\t 3.00\tDEVANAGARI DIGIT THREE\r\n", + "U+136b\t ፫ \t-\tisdig\tisnum\t 3.00\tETHIOPIC DIGIT THREE\r\n", + "U+216b\t Ⅻ \t-\t-\tisnum\t12.00\tROMAN NUMERAL TWELVE\r\n", + "U+2466\t ⑦ \t-\tisdig\tisnum\t 7.00\tCIRCLED DIGIT SEVEN\r\n", + "U+2480\t ⒀ \t-\t-\tisnum\t13.00\tPARENTHESIZED NUMBER THIRTEEN\r\n", + "U+3285\t ㊅ \t-\t-\tisnum\t 6.00\tCIRCLED IDEOGRAPH SIX\r\n" + ] + } + ], + "source": [ + "!python3 numerics_demo.py" + ] + }, + { + "cell_type": "markdown", + "id": "3153f715", + "metadata": {}, + "source": [ + "## Dual-Mode str and bytes APIs" + ] + }, + { + "cell_type": "markdown", + "id": "ff0c0af0", + "metadata": {}, + "source": [ + "### str Versus bytes in Regular Expressions" + ] + }, + { + "cell_type": "markdown", + "id":
"bffe3b5f", + "metadata": {}, + "source": [ + "#### Example 4-23. [ramanujan.py](ramanujan.py): compare behavior of simple str and bytes regular expressions" + ] + }, + { + "cell_type": "code", + "execution_count": 86, + "id": "7a831ae4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Text\r\n", + " 'Ramanujan saw ௧௭௨௯ as 1729 = 1³ + 12³ = 9³ + 10³.'\r\n", + "Numbers\r\n", + " str : ['௧௭௨௯', '1729', '1', '12', '9', '10']\r\n", + " bytes: [b'1729', b'1', b'12', b'9', b'10']\r\n", + "Words\r\n", + " str : ['Ramanujan', 'saw', '௧௭௨௯', 'as', '1729', '1³', '12³', '9³', '10³']\r\n", + " bytes: [b'Ramanujan', b'saw', b'as', b'1729', b'1', b'12', b'9', b'10']\r\n" + ] + } + ], + "source": [ + "!python3 ramanujan.py" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.4" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/04-text-byte/README.rst b/04-text-byte/README.rst index 60e8138..de75cb6 100644 --- a/04-text-byte/README.rst +++ b/04-text-byte/README.rst @@ -1,4 +1,4 @@ -Sample code for Chapter 4 - "Text and bytes" +Sample code for Chapter 4 - "Unicode Text Versus Bytes" -From the book "Fluent Python" by Luciano Ramalho (O'Reilly, 2015) -http://shop.oreilly.com/product/0636920032519.do +From the book **Fluent Python, Second Edition** by Luciano Ramalho (O'Reilly, 2022). +https://learning.oreilly.com/library/view/fluent-python-2nd/9781492056348/