added encodings

redb0 · redb0 · commit 8db09e31020a · 2020-12-11T15:33:43.000+07:00
diff --git a/python_pd/05_files/01_files.ipynb b/python_pd/05_files/01_files.ipynb
@@ -15,7 +15,8 @@
   "orig_nbformat": 2,
   "kernelspec": {
    "name": "python3",
-   "display_name": "Python 3"
+   "display_name": "Python 3",
+   "language": "python"
   }
  },
  "nbformat": 4,
@@ -49,7 +50,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [
     {
@@ -103,7 +104,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 23,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [
     {
@@ -131,7 +132,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 25,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [
     {
@@ -166,7 +167,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 30,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [
     {
@@ -198,7 +199,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 27,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [
     {
@@ -226,7 +227,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 28,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [
     {
@@ -291,7 +292,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 31,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -302,7 +303,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 32,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [
     {
@@ -326,15 +327,267 @@
    "cell_type": "markdown",
    "metadata": {}
   },
+  {
+   "source": [
+    "# Кодировки (модуль ```codecs```)\n",
+    "\n",
+    "Файлы как и строки хранят информацию в соответствии с определенной кодировкой. Существует большое количество кодировок, вот некоторые из них. "
+   ],
+   "cell_type": "markdown",
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "output_type": "stream",
+     "name": "stdout",
+     "text": [
+      "len(all_encodings) = 326\n--------------------------------------------------\ndict_keys(['646', 'ansi_x3.4_1968', 'ansi_x3_4_1968', 'ansi_x3.4_1986', 'cp367', 'csascii', 'ibm367', 'iso646_us', 'iso_646.irv_1991', 'iso_ir_6', 'us', 'us_ascii', 'base64', 'base_64', 'big5_tw', 'csbig5', 'big5_hkscs', 'hkscs', 'bz2', '037', 'csibm037', 'ebcdic_cp_ca', 'ebcdic_cp_nl', 'ebcdic_cp_us', 'ebcdic_cp_wt', 'ibm037', 'ibm039', '1026', 'csibm1026', 'ibm1026', '1125', 'ibm1125', 'cp866u', 'ruscii', '1140', 'ibm1140', '1250', 'windows_1250', '1251', 'windows_1251', '1252', 'windows_1252', '1253', 'windows_1253', '1254', 'windows_1254', '1255', 'windows_1255', '1256', 'windows_1256', '1257', 'windows_1257', '1258', 'windows_1258', '273', 'ibm273', 'csibm273', '424', 'csibm424', 'ebcdic_cp_he', 'ibm424', '437', 'cspc8codepage437', 'ibm437', '500', 'csibm500', 'ebcdic_cp_be', 'ebcdic_cp_ch', 'ibm500', '775', 'cspc775baltic', 'ibm775', '850', 'cspc850multilingual', 'ibm850', '852', 'cspcp852', 'ibm852', '855', 'csibm855', 'ibm855', '857', 'csibm857', 'ibm857', '858', 'csibm858', 'ibm858', '860', 'csibm860', 'ibm860', '861', 'cp_is', 'csibm861', 'ibm861', '862', 'cspc862latinhebrew', 'ibm862', '863', 'csibm863', 'ibm863', '864', 'csibm864', 'ibm864', '865', 'csibm865', 'ibm865', '866', 'csibm866', 'ibm866', '869', 'cp_gr', 'csibm869', 'ibm869', '932', 'ms932', 'mskanji', 'ms_kanji', '949', 'ms949', 'uhc', '950', 'ms950', 'jisx0213', 'eucjis2004', 'euc_jis2004', 'eucjisx0213', 'eucjp', 'ujis', 'u_jis', 'euckr', 'korean', 'ksc5601', 'ks_c_5601', 'ks_c_5601_1987', 'ksx1001', 'ks_x_1001', 'gb18030_2000', 'chinese', 'csiso58gb231280', 'euc_cn', 'euccn', 'eucgb2312_cn', 'gb2312_1980', 'gb2312_80', 'iso_ir_58', '936', 'cp936', 'ms936', 'hex', 'roman8', 'r8', 'csHPRoman8', 'cp1051', 'ibm1051', 'hzgb', 'hz_gb', 'hz_gb_2312', 'csiso2022jp', 'iso2022jp', 'iso_2022_jp', 'iso2022jp_1', 'iso_2022_jp_1', 'iso2022jp_2', 'iso_2022_jp_2', 'iso_2022_jp_2004', 'iso2022jp_2004', 'iso2022jp_3', 'iso_2022_jp_3', 'iso2022jp_ext', 'iso_2022_jp_ext', 'csiso2022kr', 'iso2022kr', 'iso_2022_kr', 'csisolatin6', 'iso_8859_10', 'iso_8859_10_1992', 'iso_ir_157', 'l6', 'latin6', 'thai', 'iso_8859_11', 'iso_8859_11_2001', 'iso_8859_13', 'l7', 'latin7', 'iso_8859_14', 'iso_8859_14_1998', 'iso_celtic', 'iso_ir_199', 'l8', 'latin8', 'iso_8859_15', 'l9', 'latin9', 'iso_8859_16', 'iso_8859_16_2001', 'iso_ir_226', 'l10', 'latin10', 'csisolatin2', 'iso_8859_2', 'iso_8859_2_1987', 'iso_ir_101', 'l2', 'latin2', 'csisolatin3', 'iso_8859_3', 'iso_8859_3_1988', 'iso_ir_109', 'l3', 'latin3', 'csisolatin4', 'iso_8859_4', 'iso_8859_4_1988', 'iso_ir_110', 'l4', 'latin4', 'csisolatincyrillic', 'cyrillic', 'iso_8859_5', 'iso_8859_5_1988', 'iso_ir_144', 'arabic', 'asmo_708', 'csisolatinarabic', 'ecma_114', 'iso_8859_6', 'iso_8859_6_1987', 'iso_ir_127', 'csisolatingreek', 'ecma_118', 'elot_928', 'greek', 'greek8', 'iso_8859_7', 'iso_8859_7_1987', 'iso_ir_126', 'csisolatinhebrew', 'hebrew', 'iso_8859_8', 'iso_8859_8_1988', 'iso_ir_138', 'csisolatin5', 'iso_8859_9', 'iso_8859_9_1989', 'iso_ir_148', 'l5', 'latin5', 'cp1361', 'ms1361', 'cskoi8r', 'kz_1048', 'rk1048', 'strk1048_2002', '8859', 'cp819', 'csisolatin1', 'ibm819', 'iso8859', 'iso8859_1', 'iso_8859_1', 'iso_8859_1_1987', 'iso_ir_100', 'l1', 'latin', 'latin1', 'maccyrillic', 'macgreek', 'maciceland', 'maccentraleurope', 'mac_centeuro', 'maclatin2', 'macintosh', 'macroman', 'macturkish', 'ansi', 'dbcs', 'csptcp154', 'pt154', 'cp154', 'cyrillic_asian', 'quopri', 'quoted_printable', 'quotedprintable', 'rot13', 'csshiftjis', 'shiftjis', 'sjis', 's_jis', 'shiftjis2004', 'sjis_2004', 's_jis_2004', 'shiftjisx0213', 'sjisx0213', 's_jisx0213', 'tis620', 'tis_620_0', 'tis_620_2529_0', 'tis_620_2529_1', 'iso_ir_166', 'u16', 'utf16', 'unicodebigunmarked', 'utf_16be', 'unicodelittleunmarked', 'utf_16le', 'u32', 'utf32', 'utf_32be', 'utf_32le', 'u7', 'utf7', 'unicode_1_1_utf_7', 'u8', 'utf', 'utf8', 'utf8_ucs2', 'utf8_ucs4', 'cp65001', 'uu', 'zip', 'zlib', 'x_mac_japanese', 'x_mac_korean', 'x_mac_simp_chinese', 'x_mac_trad_chinese'])\n"
+     ]
+    }
+   ],
+   "source": [
+    "import encodings\n",
+    "\n",
+    "all_encodings = encodings.aliases.aliases.keys()\n",
+    "\n",
+    "print(f'{len(all_encodings) = }')\n",
+    "print('-' * 50)\n",
+    "print(all_encodings)"
+   ]
+  },
+  {
+   "source": [
+    "Узнать системную кодировку можно с помощью модуля ```sys```."
+   ],
+   "cell_type": "markdown",
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "output_type": "stream",
+     "name": "stdout",
+     "text": [
+      "utf-8\n"
+     ]
+    }
+   ],
+   "source": [
+    "import sys\n",
+    "\n",
+    "print(sys.getdefaultencoding())"
+   ]
+  },
+  {
+   "source": [
+    "Для работы с кодировками существуют несколько разных модулей. Например, ```codecs``` предоставляет возможность поиска кодеков для разных кодировок и работу с ними. "
+   ],
+   "cell_type": "markdown",
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "output_type": "stream",
+     "name": "stdout",
+     "text": [
+      "utf8 = <codecs.CodecInfo object for encoding utf-8 at 0x24e2124ebe0>\nutf8.name = 'utf-8'\nutf8.encode = <built-in function utf_8_encode>\nutf8.decode = <function decode at 0x0000024E21261700>\n"
+     ]
+    }
+   ],
+   "source": [
+    "import codecs\n",
+    "\n",
+    "# поиск кодировки utf-8\n",
+    "utf8 = codecs.lookup('utf-8')\n",
+    "\n",
+    "# возвращается специяальных объект CodecInfo, который \n",
+    "# предоставляет доступ к функциям кодирования и декодирования.\n",
+    "print(f'{utf8 = }')\n",
+    "print(f'{utf8.name = }')\n",
+    "print(f'{utf8.encode = }')\n",
+    "print(f'{utf8.decode = }')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "output_type": "stream",
+     "name": "stdout",
+     "text": [
+      "bs = (b'foo', 3)\ns = ('foo', 3)\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Кодирование строки в байты (каждый символ занимает 1 байт)\n",
+    "bs = utf8.encode('foo')\n",
+    "print(f'{bs = }')\n",
+    "\n",
+    "# Декоридование строки из байт\n",
+    "s = utf8.decode(bs[0])\n",
+    "print(f'{s = }')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "output_type": "stream",
+     "name": "stdout",
+     "text": [
+      "bs = (b'\\xd0\\xbf\\xd0\\xb8\\xd1\\x82\\xd0\\xbe\\xd0\\xbd', 5)\ns = ('питон', 10)\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Кодирование строки в байты (каждый символ занимает 2 байта)\n",
+    "bs = utf8.encode('питон')\n",
+    "print(f'{bs = }')\n",
+    "\n",
+    "# Декоридование строки из байт\n",
+    "s = utf8.decode(bs[0])\n",
+    "print(f'{s = }')"
+   ]
+  },
+  {
+   "source": [
+    "Кодирование и декодирование строки можно происзводить разными кодировками. Адекватный результат при этом не гарантируется."
+   ],
+   "cell_type": "markdown",
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [
+    {
+     "output_type": "stream",
+     "name": "stdout",
+     "text": [
+      "bs = (b'\\xe2\\xee\\xef\\xf0\\xee\\xf1', 6)\ns = ('БНОПНЯ', 6)\n"
+     ]
+    }
+   ],
+   "source": [
+    "# поиск кодировки koi8-r\n",
+    "koi8 = codecs.lookup('koi8-r')\n",
+    "# поиск кодировки koi8-r\n",
+    "cp1251 = codecs.lookup('cp1251')\n",
+    "\n",
+    "# Кодирование строки в байты с помощью cp1251\n",
+    "bs = cp1251.encode('вопрос')\n",
+    "print(f'{bs = }')\n",
+    "\n",
+    "# Декоридование строки из байт с помощью koi8-r\n",
+    "s = koi8.decode(bs[0])\n",
+    "print(f'{s = }')"
+   ]
+  },
+  {
+   "source": [
+    "В Python есть \"кодировка\", реализующая шифр Цезаря. "
+   ],
+   "cell_type": "markdown",
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [
+    {
+     "output_type": "stream",
+     "name": "stdout",
+     "text": [
+      "\"rot13\" in all_encodings = True\n"
+     ]
+    },
+    {
+     "output_type": "execute_result",
+     "data": {
+      "text/plain": [
+       "('dhrfgvba', 8)"
+      ]
+     },
+     "metadata": {},
+     "execution_count": 15
+    }
+   ],
+   "source": [
+    "print(f'{\"rot13\" in all_encodings = }')\n",
+    "\n",
+    "rot13 = codecs.lookup('rot-13')\n",
+    "\n",
+    "rot13.encode('question')"
+   ]
+  },
+  {
+   "source": [
+    "## Модуль ```unicodedata```\n",
+    "\n",
+    "Этот модуль обеспечивает доступ к базе данных символов Unicode, которая определяет свойства символов для всех символов Unicode."
+   ],
+   "cell_type": "markdown",
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [
+    {
+     "output_type": "stream",
+     "name": "stdout",
+     "text": [
+      "unicodedata.lookup(\"banana\") = '🍌'\nunicodedata.lookup(\"snake\") = '🐍'\nunicodedata.name(\"🗿\") = 'MOYAI'\nunicodedata.category(\"4\") = 'Nd': decimal number\nunicodedata.category(\"🛠\") = 'So': other symbol\n"
+     ]
+    }
+   ],
+   "source": [
+    "import unicodedata\n",
+    "\n",
+    "# получение символа по названию\n",
+    "print(f'{unicodedata.lookup(\"banana\") = }')\n",
+    "print(f'{unicodedata.lookup(\"snake\") = }')\n",
+    "\n",
+    "# получение названия по символу\n",
+    "print(f'{unicodedata.name(\"🗿\") = }')\n",
+    "\n",
+    "# получение категории символа\n",
+    "print(f'{unicodedata.category(\"4\") = }: decimal number')\n",
+    "print(f'{unicodedata.category(\"🛠\") = }: other symbol')"
+   ]
+  },
   {
    "source": [
     "# Полезные ссылки\n",
     "\n",
     "- [File object](https://docs.python.org/3/glossary.html#term-file-object)\n",
     "- [Reading and Writing Files in Python (Guide)](https://realpython.com/read-write-files-python/)\n",
     "- [Документация к модулю ```io```](https://docs.python.org/3/library/io.html)\n",
+    "- [Документация к модулю ```codecs```](https://docs.python.org/3.9/library/codecs.html)\n",
+    "- [Документация к модулю ```unicodedata```](https://docs.python.org/3/library/unicodedata.html)\n",
     "- [Difference between modes a, a+, w, w+, and r+ in built-in open function?](https://stackoverflow.com/questions/1466000/difference-between-modes-a-a-w-w-and-r-in-built-in-open-function)\n",
-    "- [Reading and Writing Files (документация)](https://docs.python.org/3.9/tutorial/inputoutput.html#reading-and-writing-files)"
+    "- [Reading and Writing Files (документация)](https://docs.python.org/3.9/tutorial/inputoutput.html#reading-and-writing-files)\n",
+    "- [How to determine the encoding of text?](https://stackoverflow.com/questions/436220/how-to-determine-the-encoding-of-text)\n",
+    "- [Unicode HOWTO](https://docs.python.org/3.9/howto/unicode.html)"
    ],
    "cell_type": "markdown",
    "metadata": {}