Skip to content

Commit 8db09e3

Browse files
committed
added encodings
1 parent 0651fa3 commit 8db09e3

File tree

1 file changed

+263
-10
lines changed

1 file changed

+263
-10
lines changed

python_pd/05_files/01_files.ipynb

Lines changed: 263 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,8 @@
1515
"orig_nbformat": 2,
1616
"kernelspec": {
1717
"name": "python3",
18-
"display_name": "Python 3"
18+
"display_name": "Python 3",
19+
"language": "python"
1920
}
2021
},
2122
"nbformat": 4,
@@ -49,7 +50,7 @@
4950
},
5051
{
5152
"cell_type": "code",
52-
"execution_count": 22,
53+
"execution_count": 1,
5354
"metadata": {},
5455
"outputs": [
5556
{
@@ -103,7 +104,7 @@
103104
},
104105
{
105106
"cell_type": "code",
106-
"execution_count": 23,
107+
"execution_count": 2,
107108
"metadata": {},
108109
"outputs": [
109110
{
@@ -131,7 +132,7 @@
131132
},
132133
{
133134
"cell_type": "code",
134-
"execution_count": 25,
135+
"execution_count": 3,
135136
"metadata": {},
136137
"outputs": [
137138
{
@@ -166,7 +167,7 @@
166167
},
167168
{
168169
"cell_type": "code",
169-
"execution_count": 30,
170+
"execution_count": 4,
170171
"metadata": {},
171172
"outputs": [
172173
{
@@ -198,7 +199,7 @@
198199
},
199200
{
200201
"cell_type": "code",
201-
"execution_count": 27,
202+
"execution_count": 5,
202203
"metadata": {},
203204
"outputs": [
204205
{
@@ -226,7 +227,7 @@
226227
},
227228
{
228229
"cell_type": "code",
229-
"execution_count": 28,
230+
"execution_count": 6,
230231
"metadata": {},
231232
"outputs": [
232233
{
@@ -291,7 +292,7 @@
291292
},
292293
{
293294
"cell_type": "code",
294-
"execution_count": 31,
295+
"execution_count": 7,
295296
"metadata": {},
296297
"outputs": [],
297298
"source": [
@@ -302,7 +303,7 @@
302303
},
303304
{
304305
"cell_type": "code",
305-
"execution_count": 32,
306+
"execution_count": 8,
306307
"metadata": {},
307308
"outputs": [
308309
{
@@ -326,15 +327,267 @@
326327
"cell_type": "markdown",
327328
"metadata": {}
328329
},
330+
{
331+
"source": [
332+
"# Кодировки (модуль ```codecs```)\n",
333+
"\n",
334+
"Файлы как и строки хранят информацию в соответствии с определенной кодировкой. Существует большое количество кодировок, вот некоторые из них. "
335+
],
336+
"cell_type": "markdown",
337+
"metadata": {}
338+
},
339+
{
340+
"cell_type": "code",
341+
"execution_count": 9,
342+
"metadata": {},
343+
"outputs": [
344+
{
345+
"output_type": "stream",
346+
"name": "stdout",
347+
"text": [
348+
"len(all_encodings) = 326\n--------------------------------------------------\ndict_keys(['646', 'ansi_x3.4_1968', 'ansi_x3_4_1968', 'ansi_x3.4_1986', 'cp367', 'csascii', 'ibm367', 'iso646_us', 'iso_646.irv_1991', 'iso_ir_6', 'us', 'us_ascii', 'base64', 'base_64', 'big5_tw', 'csbig5', 'big5_hkscs', 'hkscs', 'bz2', '037', 'csibm037', 'ebcdic_cp_ca', 'ebcdic_cp_nl', 'ebcdic_cp_us', 'ebcdic_cp_wt', 'ibm037', 'ibm039', '1026', 'csibm1026', 'ibm1026', '1125', 'ibm1125', 'cp866u', 'ruscii', '1140', 'ibm1140', '1250', 'windows_1250', '1251', 'windows_1251', '1252', 'windows_1252', '1253', 'windows_1253', '1254', 'windows_1254', '1255', 'windows_1255', '1256', 'windows_1256', '1257', 'windows_1257', '1258', 'windows_1258', '273', 'ibm273', 'csibm273', '424', 'csibm424', 'ebcdic_cp_he', 'ibm424', '437', 'cspc8codepage437', 'ibm437', '500', 'csibm500', 'ebcdic_cp_be', 'ebcdic_cp_ch', 'ibm500', '775', 'cspc775baltic', 'ibm775', '850', 'cspc850multilingual', 'ibm850', '852', 'cspcp852', 'ibm852', '855', 'csibm855', 'ibm855', '857', 'csibm857', 'ibm857', '858', 'csibm858', 'ibm858', '860', 'csibm860', 'ibm860', '861', 'cp_is', 'csibm861', 'ibm861', '862', 'cspc862latinhebrew', 'ibm862', '863', 'csibm863', 'ibm863', '864', 'csibm864', 'ibm864', '865', 'csibm865', 'ibm865', '866', 'csibm866', 'ibm866', '869', 'cp_gr', 'csibm869', 'ibm869', '932', 'ms932', 'mskanji', 'ms_kanji', '949', 'ms949', 'uhc', '950', 'ms950', 'jisx0213', 'eucjis2004', 'euc_jis2004', 'eucjisx0213', 'eucjp', 'ujis', 'u_jis', 'euckr', 'korean', 'ksc5601', 'ks_c_5601', 'ks_c_5601_1987', 'ksx1001', 'ks_x_1001', 'gb18030_2000', 'chinese', 'csiso58gb231280', 'euc_cn', 'euccn', 'eucgb2312_cn', 'gb2312_1980', 'gb2312_80', 'iso_ir_58', '936', 'cp936', 'ms936', 'hex', 'roman8', 'r8', 'csHPRoman8', 'cp1051', 'ibm1051', 'hzgb', 'hz_gb', 'hz_gb_2312', 'csiso2022jp', 'iso2022jp', 'iso_2022_jp', 'iso2022jp_1', 'iso_2022_jp_1', 'iso2022jp_2', 'iso_2022_jp_2', 'iso_2022_jp_2004', 'iso2022jp_2004', 'iso2022jp_3', 'iso_2022_jp_3', 'iso2022jp_ext', 'iso_2022_jp_ext', 'csiso2022kr', 'iso2022kr', 'iso_2022_kr', 'csisolatin6', 'iso_8859_10', 'iso_8859_10_1992', 'iso_ir_157', 'l6', 'latin6', 'thai', 'iso_8859_11', 'iso_8859_11_2001', 'iso_8859_13', 'l7', 'latin7', 'iso_8859_14', 'iso_8859_14_1998', 'iso_celtic', 'iso_ir_199', 'l8', 'latin8', 'iso_8859_15', 'l9', 'latin9', 'iso_8859_16', 'iso_8859_16_2001', 'iso_ir_226', 'l10', 'latin10', 'csisolatin2', 'iso_8859_2', 'iso_8859_2_1987', 'iso_ir_101', 'l2', 'latin2', 'csisolatin3', 'iso_8859_3', 'iso_8859_3_1988', 'iso_ir_109', 'l3', 'latin3', 'csisolatin4', 'iso_8859_4', 'iso_8859_4_1988', 'iso_ir_110', 'l4', 'latin4', 'csisolatincyrillic', 'cyrillic', 'iso_8859_5', 'iso_8859_5_1988', 'iso_ir_144', 'arabic', 'asmo_708', 'csisolatinarabic', 'ecma_114', 'iso_8859_6', 'iso_8859_6_1987', 'iso_ir_127', 'csisolatingreek', 'ecma_118', 'elot_928', 'greek', 'greek8', 'iso_8859_7', 'iso_8859_7_1987', 'iso_ir_126', 'csisolatinhebrew', 'hebrew', 'iso_8859_8', 'iso_8859_8_1988', 'iso_ir_138', 'csisolatin5', 'iso_8859_9', 'iso_8859_9_1989', 'iso_ir_148', 'l5', 'latin5', 'cp1361', 'ms1361', 'cskoi8r', 'kz_1048', 'rk1048', 'strk1048_2002', '8859', 'cp819', 'csisolatin1', 'ibm819', 'iso8859', 'iso8859_1', 'iso_8859_1', 'iso_8859_1_1987', 'iso_ir_100', 'l1', 'latin', 'latin1', 'maccyrillic', 'macgreek', 'maciceland', 'maccentraleurope', 'mac_centeuro', 'maclatin2', 'macintosh', 'macroman', 'macturkish', 'ansi', 'dbcs', 'csptcp154', 'pt154', 'cp154', 'cyrillic_asian', 'quopri', 'quoted_printable', 'quotedprintable', 'rot13', 'csshiftjis', 'shiftjis', 'sjis', 's_jis', 'shiftjis2004', 'sjis_2004', 's_jis_2004', 'shiftjisx0213', 'sjisx0213', 's_jisx0213', 'tis620', 'tis_620_0', 'tis_620_2529_0', 'tis_620_2529_1', 'iso_ir_166', 'u16', 'utf16', 'unicodebigunmarked', 'utf_16be', 'unicodelittleunmarked', 'utf_16le', 'u32', 'utf32', 'utf_32be', 'utf_32le', 'u7', 'utf7', 'unicode_1_1_utf_7', 'u8', 'utf', 'utf8', 'utf8_ucs2', 'utf8_ucs4', 'cp65001', 'uu', 'zip', 'zlib', 'x_mac_japanese', 'x_mac_korean', 'x_mac_simp_chinese', 'x_mac_trad_chinese'])\n"
349+
]
350+
}
351+
],
352+
"source": [
353+
"import encodings\n",
354+
"\n",
355+
"all_encodings = encodings.aliases.aliases.keys()\n",
356+
"\n",
357+
"print(f'{len(all_encodings) = }')\n",
358+
"print('-' * 50)\n",
359+
"print(all_encodings)"
360+
]
361+
},
362+
{
363+
"source": [
364+
"Узнать системную кодировку можно с помощью модуля ```sys```."
365+
],
366+
"cell_type": "markdown",
367+
"metadata": {}
368+
},
369+
{
370+
"cell_type": "code",
371+
"execution_count": 10,
372+
"metadata": {},
373+
"outputs": [
374+
{
375+
"output_type": "stream",
376+
"name": "stdout",
377+
"text": [
378+
"utf-8\n"
379+
]
380+
}
381+
],
382+
"source": [
383+
"import sys\n",
384+
"\n",
385+
"print(sys.getdefaultencoding())"
386+
]
387+
},
388+
{
389+
"source": [
390+
"Для работы с кодировками существуют несколько разных модулей. Например, ```codecs``` предоставляет возможность поиска кодеков для разных кодировок и работу с ними. "
391+
],
392+
"cell_type": "markdown",
393+
"metadata": {}
394+
},
395+
{
396+
"cell_type": "code",
397+
"execution_count": 11,
398+
"metadata": {},
399+
"outputs": [
400+
{
401+
"output_type": "stream",
402+
"name": "stdout",
403+
"text": [
404+
"utf8 = <codecs.CodecInfo object for encoding utf-8 at 0x24e2124ebe0>\nutf8.name = 'utf-8'\nutf8.encode = <built-in function utf_8_encode>\nutf8.decode = <function decode at 0x0000024E21261700>\n"
405+
]
406+
}
407+
],
408+
"source": [
409+
"import codecs\n",
410+
"\n",
411+
"# поиск кодировки utf-8\n",
412+
"utf8 = codecs.lookup('utf-8')\n",
413+
"\n",
414+
"# возвращается специяальных объект CodecInfo, который \n",
415+
"# предоставляет доступ к функциям кодирования и декодирования.\n",
416+
"print(f'{utf8 = }')\n",
417+
"print(f'{utf8.name = }')\n",
418+
"print(f'{utf8.encode = }')\n",
419+
"print(f'{utf8.decode = }')"
420+
]
421+
},
422+
{
423+
"cell_type": "code",
424+
"execution_count": 12,
425+
"metadata": {},
426+
"outputs": [
427+
{
428+
"output_type": "stream",
429+
"name": "stdout",
430+
"text": [
431+
"bs = (b'foo', 3)\ns = ('foo', 3)\n"
432+
]
433+
}
434+
],
435+
"source": [
436+
"# Кодирование строки в байты (каждый символ занимает 1 байт)\n",
437+
"bs = utf8.encode('foo')\n",
438+
"print(f'{bs = }')\n",
439+
"\n",
440+
"# Декоридование строки из байт\n",
441+
"s = utf8.decode(bs[0])\n",
442+
"print(f'{s = }')"
443+
]
444+
},
445+
{
446+
"cell_type": "code",
447+
"execution_count": 13,
448+
"metadata": {},
449+
"outputs": [
450+
{
451+
"output_type": "stream",
452+
"name": "stdout",
453+
"text": [
454+
"bs = (b'\\xd0\\xbf\\xd0\\xb8\\xd1\\x82\\xd0\\xbe\\xd0\\xbd', 5)\ns = ('питон', 10)\n"
455+
]
456+
}
457+
],
458+
"source": [
459+
"# Кодирование строки в байты (каждый символ занимает 2 байта)\n",
460+
"bs = utf8.encode('питон')\n",
461+
"print(f'{bs = }')\n",
462+
"\n",
463+
"# Декоридование строки из байт\n",
464+
"s = utf8.decode(bs[0])\n",
465+
"print(f'{s = }')"
466+
]
467+
},
468+
{
469+
"source": [
470+
"Кодирование и декодирование строки можно происзводить разными кодировками. Адекватный результат при этом не гарантируется."
471+
],
472+
"cell_type": "markdown",
473+
"metadata": {}
474+
},
475+
{
476+
"cell_type": "code",
477+
"execution_count": 14,
478+
"metadata": {},
479+
"outputs": [
480+
{
481+
"output_type": "stream",
482+
"name": "stdout",
483+
"text": [
484+
"bs = (b'\\xe2\\xee\\xef\\xf0\\xee\\xf1', 6)\ns = ('БНОПНЯ', 6)\n"
485+
]
486+
}
487+
],
488+
"source": [
489+
"# поиск кодировки koi8-r\n",
490+
"koi8 = codecs.lookup('koi8-r')\n",
491+
"# поиск кодировки koi8-r\n",
492+
"cp1251 = codecs.lookup('cp1251')\n",
493+
"\n",
494+
"# Кодирование строки в байты с помощью cp1251\n",
495+
"bs = cp1251.encode('вопрос')\n",
496+
"print(f'{bs = }')\n",
497+
"\n",
498+
"# Декоридование строки из байт с помощью koi8-r\n",
499+
"s = koi8.decode(bs[0])\n",
500+
"print(f'{s = }')"
501+
]
502+
},
503+
{
504+
"source": [
505+
"В Python есть \"кодировка\", реализующая шифр Цезаря. "
506+
],
507+
"cell_type": "markdown",
508+
"metadata": {}
509+
},
510+
{
511+
"cell_type": "code",
512+
"execution_count": 15,
513+
"metadata": {},
514+
"outputs": [
515+
{
516+
"output_type": "stream",
517+
"name": "stdout",
518+
"text": [
519+
"\"rot13\" in all_encodings = True\n"
520+
]
521+
},
522+
{
523+
"output_type": "execute_result",
524+
"data": {
525+
"text/plain": [
526+
"('dhrfgvba', 8)"
527+
]
528+
},
529+
"metadata": {},
530+
"execution_count": 15
531+
}
532+
],
533+
"source": [
534+
"print(f'{\"rot13\" in all_encodings = }')\n",
535+
"\n",
536+
"rot13 = codecs.lookup('rot-13')\n",
537+
"\n",
538+
"rot13.encode('question')"
539+
]
540+
},
541+
{
542+
"source": [
543+
"## Модуль ```unicodedata```\n",
544+
"\n",
545+
"Этот модуль обеспечивает доступ к базе данных символов Unicode, которая определяет свойства символов для всех символов Unicode."
546+
],
547+
"cell_type": "markdown",
548+
"metadata": {}
549+
},
550+
{
551+
"cell_type": "code",
552+
"execution_count": 16,
553+
"metadata": {},
554+
"outputs": [
555+
{
556+
"output_type": "stream",
557+
"name": "stdout",
558+
"text": [
559+
"unicodedata.lookup(\"banana\") = '🍌'\nunicodedata.lookup(\"snake\") = '🐍'\nunicodedata.name(\"🗿\") = 'MOYAI'\nunicodedata.category(\"4\") = 'Nd': decimal number\nunicodedata.category(\"🛠\") = 'So': other symbol\n"
560+
]
561+
}
562+
],
563+
"source": [
564+
"import unicodedata\n",
565+
"\n",
566+
"# получение символа по названию\n",
567+
"print(f'{unicodedata.lookup(\"banana\") = }')\n",
568+
"print(f'{unicodedata.lookup(\"snake\") = }')\n",
569+
"\n",
570+
"# получение названия по символу\n",
571+
"print(f'{unicodedata.name(\"🗿\") = }')\n",
572+
"\n",
573+
"# получение категории символа\n",
574+
"print(f'{unicodedata.category(\"4\") = }: decimal number')\n",
575+
"print(f'{unicodedata.category(\"🛠\") = }: other symbol')"
576+
]
577+
},
329578
{
330579
"source": [
331580
"# Полезные ссылки\n",
332581
"\n",
333582
"- [File object](https://docs.python.org/3/glossary.html#term-file-object)\n",
334583
"- [Reading and Writing Files in Python (Guide)](https://realpython.com/read-write-files-python/)\n",
335584
"- [Документация к модулю ```io```](https://docs.python.org/3/library/io.html)\n",
585+
"- [Документация к модулю ```codecs```](https://docs.python.org/3.9/library/codecs.html)\n",
586+
"- [Документация к модулю ```unicodedata```](https://docs.python.org/3/library/unicodedata.html)\n",
336587
"- [Difference between modes a, a+, w, w+, and r+ in built-in open function?](https://stackoverflow.com/questions/1466000/difference-between-modes-a-a-w-w-and-r-in-built-in-open-function)\n",
337-
"- [Reading and Writing Files (документация)](https://docs.python.org/3.9/tutorial/inputoutput.html#reading-and-writing-files)"
588+
"- [Reading and Writing Files (документация)](https://docs.python.org/3.9/tutorial/inputoutput.html#reading-and-writing-files)\n",
589+
"- [How to determine the encoding of text?](https://stackoverflow.com/questions/436220/how-to-determine-the-encoding-of-text)\n",
590+
"- [Unicode HOWTO](https://docs.python.org/3.9/howto/unicode.html)"
338591
],
339592
"cell_type": "markdown",
340593
"metadata": {}

0 commit comments

Comments
 (0)