Skip to content

Commit 65d110d

Browse files
miss-islington, serhiy-storchaka, claude
authored
[3.13] gh-152415: Exercise curses non-ASCII tests under 8-bit locale encodings (GH-152416) (GH-152453) (GH-152457)
The non-ASCII tests only exercised what the runner's locale could encode (in practice UTF-8). Add 8-bit-encoding cases to the character and string I/O tests, each guarded by the existing encodability check: ASCII, a character common to the Latin encodings ('é'), and ones distinctive to a single encoding (byte 0xA4 is '¤' in ISO-8859-1, '€' in ISO-8859-15, 'є' in KOI8-U). Run the whole suite under different locales to cover them; unrepresentable cases skip.

* gh-152415: Verify character output round-trips in test_output_character

Read each written character back with in_wch() or instr() rather than inch(), which on a wide build returns the low byte of the code point instead of the locale-encoded byte and so mangles a non-ASCII character of an 8-bit locale. This lets the int-argument cases cover '€'/'є', and adds matching coverage for the str argument.

insch() with an int byte > 127 is checked only for Latin-1: on a wide build ncurses winsch stores a printable byte directly as a code point instead of decoding it through the locale.

(cherry picked from commit 003d362)
(cherry picked from commit a75aa41)

Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
Co-authored-by: Claude Opus 4.8 <noreply@anthropic.com>
1 parent 4b6a199 commit 65d110d

1 file changed

Lines changed: 242 additions & 27 deletions

File tree

Lib/test/test_curses.py

Lines changed: 242 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -255,6 +255,33 @@ def test_refresh_control(self):
255255
self.assertIs(win.is_wintouched(), syncok)
256256
self.assertIs(stdscr.is_wintouched(), syncok)
257257

258+
# Many tests below use a common set of non-ASCII cases, each applied only
259+
# when the window encoding can represent it -- so the whole suite is meant to
260+
# be run under several locales (e.g. ISO-8859-1, ISO-8859-15, KOI8-U):
261+
# 'A'/'a' ASCII
262+
# 'é' common to the Latin encodings
263+
# '¤'/'€'/'є' byte 0xA4 in ISO-8859-1 / ISO-8859-15 / KOI8-U
264+
# Precomposed characters are used so a round-trip does not depend on the form.
265+
266+
def _encodable(self, s):
267+
# Wide characters are only supported in a locale that can encode them.
268+
try:
269+
s.encode(self.stdscr.encoding)
270+
except UnicodeEncodeError:
271+
return False
272+
return True
273+
274+
def _read_char(self, y, x):
275+
# The character written to a cell, read back for output checks. inch()
276+
# is unusable here: on a wide build it returns the low 8 bits of the
277+
# character's code point rather than its locale-encoded byte, mangling
278+
# anything outside Latin-1. in_wch() reads the wide cell directly;
279+
# without it, instr() re-encodes the cell to the window encoding.
280+
stdscr = self.stdscr
281+
if hasattr(stdscr, 'in_wch'):
282+
return str(stdscr.in_wch(y, x))
283+
return stdscr.instr(y, x, 1).decode(stdscr.encoding)
284+
258285
def test_output_character(self):
259286
stdscr = self.stdscr
260287
encoding = stdscr.encoding
@@ -264,32 +291,98 @@ def test_output_character(self):
264291
stdscr.addch('A')
265292
stdscr.addch(b'A')
266293
stdscr.addch(65)
267-
c = '\u20ac'
268-
try:
269-
stdscr.addch(c)
270-
except UnicodeEncodeError:
271-
self.assertRaises(UnicodeEncodeError, c.encode, encoding)
272-
except OverflowError:
273-
encoded = c.encode(encoding)
274-
self.assertNotEqual(len(encoded), 1, repr(encoded))
294+
# See _encodable for the character set. Each is either written (mapped
295+
# to a single byte), or raises UnicodeEncodeError (not in the encoding)
296+
# or OverflowError (a multibyte sequence, e.g. in UTF-8).
297+
for c in ('A', '\u00e9', '\u00a4', '\u20ac', '\u0454'):
298+
try:
299+
stdscr.addch(c)
300+
except UnicodeEncodeError:
301+
self.assertRaises(UnicodeEncodeError, c.encode, encoding)
302+
except OverflowError:
303+
encoded = c.encode(encoding)
304+
self.assertNotEqual(len(encoded), 1, repr(encoded))
275305
stdscr.addch('A', curses.A_BOLD)
276306
stdscr.addch(1, 2, 'A')
277307
stdscr.addch(2, 3, 'A', curses.A_BOLD)
278308
self.assertIs(stdscr.is_wintouched(), True)
279309

310+
# The same characters supplied as an int chtype (a byte > 127). The
311+
# cell is read back with _read_char(), not inch(): on a wide build the
312+
# int is stored through the locale as a wide character that inch()
313+
# cannot represent for a character outside Latin-1.
314+
for c in ('é', '¤', '€', 'є'):
315+
try:
316+
b = c.encode(encoding)
317+
except UnicodeEncodeError:
318+
continue
319+
if len(b) != 1:
320+
continue
321+
# A wide build stores a character outside Latin-1 as a wide cell,
322+
# not as its encoded byte, so it cannot round-trip here.
323+
if ord(c) > 0xff and hasattr(stdscr, 'get_wch'):
324+
continue
325+
v = b[0]
326+
with self.subTest(c=c):
327+
stdscr.addch(0, 0, v)
328+
self.assertEqual(self._read_char(0, 0), c)
329+
stdscr.addch(0, 1, v, curses.A_BOLD)
330+
self.assertEqual(self._read_char(0, 1), c)
331+
self.assertTrue(stdscr.inch(0, 1) & curses.A_BOLD)
332+
stdscr.move(2, 0)
333+
stdscr.echochar(v)
334+
self.assertEqual(self._read_char(2, 0), c)
335+
# insch() round-trips a byte only where its code point equals
336+
# the byte value (Latin-1): on a wide build ncurses winsch
337+
# stores a printable byte directly as a code point instead of
338+
# decoding it through the locale.
339+
if ord(c) < 0x100:
340+
stdscr.insch(1, 0, v)
341+
self.assertEqual(self._read_char(1, 0), c)
342+
343+
# The same characters supplied as a str. Unlike the int path above, a
344+
# str is stored as a wide-character cell on a wide build, so every
345+
# encodable character round-trips, insch() included. A multibyte
346+
# character does not fit a cell on a narrow build and is skipped.
347+
wide = hasattr(stdscr, 'in_wch')
348+
for c in ('é', '¤', '€', 'є'):
349+
if not self._encodable(c):
350+
continue
351+
if not wide and len(c.encode(encoding)) != 1:
352+
continue
353+
# A wide build stores a character outside Latin-1 as a wide cell,
354+
# not as its encoded byte, so it cannot round-trip here.
355+
if ord(c) > 0xff and hasattr(stdscr, 'get_wch'):
356+
continue
357+
with self.subTest(c=c):
358+
stdscr.addch(0, 0, c)
359+
self.assertEqual(self._read_char(0, 0), c)
360+
stdscr.addch(0, 1, c, curses.A_BOLD)
361+
self.assertEqual(self._read_char(0, 1), c)
362+
self.assertTrue(stdscr.inch(0, 1) & curses.A_BOLD)
363+
stdscr.insch(1, 0, c)
364+
self.assertEqual(self._read_char(1, 0), c)
365+
stdscr.move(2, 0)
366+
stdscr.echochar(c)
367+
self.assertEqual(self._read_char(2, 0), c)
368+
280369
# echochar()
281370
stdscr.refresh()
282371
stdscr.move(0, 0)
283372
stdscr.echochar('A')
284373
stdscr.echochar(b'A')
285374
stdscr.echochar(65)
286-
with self.assertRaises((UnicodeEncodeError, OverflowError)):
287-
# Unicode is not fully supported yet, but at least it does
288-
# not crash.
289-
# It is supposed to fail because either the character is
290-
# not encodable with the current encoding, or it is encoded to
291-
# a multibyte sequence.
292-
stdscr.echochar('\u0114')
375+
# See _encodable for the character set; as in the addch() loop above.
376+
for c in ('A', '\u00e9', '\u00a4', '\u20ac', '\u0454'):
377+
try:
378+
stdscr.echochar(c)
379+
except UnicodeEncodeError:
380+
# The character is not encodable with the current encoding.
381+
self.assertRaises(UnicodeEncodeError, c.encode, encoding)
382+
except OverflowError:
383+
# The character is encoded to a multibyte sequence.
384+
encoded = c.encode(encoding)
385+
self.assertNotEqual(len(encoded), 1, repr(encoded))
293386
stdscr.echochar('A', curses.A_BOLD)
294387
self.assertIs(stdscr.is_wintouched(), False)
295388

@@ -299,14 +392,18 @@ def test_output_string(self):
299392
# addstr()/insstr()
300393
for func in [stdscr.addstr, stdscr.insstr]:
301394
with self.subTest(func.__qualname__):
302-
stdscr.move(0, 0)
303395
func('abcd')
304396
func(b'abcd')
305-
s = 'àßçđ'
306-
try:
307-
func(s)
308-
except UnicodeEncodeError:
309-
self.assertRaises(UnicodeEncodeError, s.encode, encoding)
397+
# Common and encoding-distinctive strings (see _encodable for the
398+
# 0xA4 set); 'àßçđ' is UTF-8-only. Each is written if the
399+
# encoding allows, else raises UnicodeEncodeError.
400+
for s in ('soupçon', 'àßçđ', 'soupçon ¤', 'soupçon €', 'дякую'):
401+
stdscr.move(0, 0)
402+
try:
403+
func(s)
404+
except UnicodeEncodeError:
405+
self.assertRaises(UnicodeEncodeError, s.encode, encoding)
406+
stdscr.move(0, 0)
310407
func('abcd', curses.A_BOLD)
311408
func(1, 2, 'abcd')
312409
func(2, 3, 'abcd', curses.A_BOLD)
@@ -317,11 +414,14 @@ def test_output_string(self):
317414
stdscr.move(0, 0)
318415
func('1234', 3)
319416
func(b'1234', 3)
320-
s = '\u0661\u0662\u0663\u0664'
321-
try:
322-
func(s, 3)
323-
except UnicodeEncodeError:
324-
self.assertRaises(UnicodeEncodeError, s.encode, encoding)
417+
# As above (see _encodable); Arabic-Indic digits are UTF-8-only.
418+
for s in ('caf\u00e9', '\u0661\u0662\u0663\u0664', 'caf\u00e9 \u00a4', 'caf\u00e9 \u20ac', '\u0434\u044f\u043a\u0443\u044e'):
419+
stdscr.move(0, 0)
420+
try:
421+
func(s, 3)
422+
except UnicodeEncodeError:
423+
self.assertRaises(UnicodeEncodeError, s.encode, encoding)
424+
stdscr.move(0, 0)
325425
func('1234', 5)
326426
func('1234', 3, curses.A_BOLD)
327427
func(1, 2, '1234', 3)
@@ -411,6 +511,24 @@ def test_read_from_window(self):
411511
self.assertEqual(stdscr.instr(0, 2, 4), b'BCD ')
412512
self.assertRaises(ValueError, stdscr.instr, -2)
413513
self.assertRaises(ValueError, stdscr.instr, 0, 2, -2)
514+
# A non-ASCII character of an 8-bit locale reads back as its encoded
515+
# byte (see _encodable for the set). instr() returns the locale bytes
516+
# for any single-byte character; inch() packs the text into a chtype, so
517+
# on a wide build it only round-trips a Latin-1 codepoint (byte ==
518+
# codepoint).
519+
encoding = stdscr.encoding
520+
for ch in ('A', 'é', '¤', '€', 'є'):
521+
try:
522+
b = ch.encode(encoding)
523+
except UnicodeEncodeError:
524+
continue
525+
if len(b) != 1:
526+
continue
527+
with self.subTest(ch=ch):
528+
stdscr.addstr(2, 0, ch)
529+
self.assertEqual(stdscr.instr(2, 0, 1), b)
530+
if ord(ch) < 0x100:
531+
self.assertEqual(stdscr.inch(2, 0) & curses.A_CHARTEXT, b[0])
414532

415533
def test_coordinate_errors(self):
416534
# Addressing a cell outside the window raises curses.error.
@@ -447,6 +565,10 @@ def test_getch(self):
447565
self.assertEqual(win.getch(), b'm'[0])
448566
self.assertEqual(win.getch(), b'\n'[0])
449567

568+
# A key value > 127 is delivered unchanged (it is not locale text).
569+
curses.ungetch(0xE9)
570+
self.assertEqual(win.getch(), 0xE9)
571+
450572
def test_getstr(self):
451573
win = curses.newwin(5, 12, 5, 2)
452574
curses.echo()
@@ -619,6 +741,33 @@ def test_background(self):
619741
self.assertEqual(win.inch(0, 0), b'L'[0] | curses.A_REVERSE)
620742
self.assertEqual(win.inch(0, 5), b'#'[0] | curses.A_REVERSE)
621743

744+
# A non-ASCII background character of an 8-bit locale reads back as its
745+
# encoded byte. See _encodable for the character set.
746+
win.bkgd(' ')
747+
encoding = win.encoding
748+
for ch in ('é', '¤', '€', 'є'):
749+
try:
750+
b = ch.encode(encoding)
751+
except UnicodeEncodeError:
752+
continue
753+
if len(b) != 1:
754+
continue
755+
# A wide build stores a character outside Latin-1 as a wide cell,
756+
# not as its encoded byte, so it cannot round-trip here.
757+
if ord(ch) > 0xff and hasattr(win, 'get_wch'):
758+
continue
759+
with self.subTest(ch=ch):
760+
win.bkgd(ch)
761+
self.assertEqual(win.getbkgd(), b[0])
762+
if ord(ch) < 0x100:
763+
# The same byte given as an int. A wide build stores it
764+
# through the locale, so only a Latin-1 byte round-trips.
765+
win.bkgd(' ')
766+
win.bkgdset(b[0])
767+
self.assertEqual(win.getbkgd(), b[0])
768+
win.bkgd(b[0])
769+
self.assertEqual(win.getbkgd(), b[0])
770+
622771
def test_overlay(self):
623772
srcwin = curses.newwin(5, 18, 3, 4)
624773
lorem_ipsum(srcwin)
@@ -711,6 +860,16 @@ def test_borders_and_lines(self):
711860
win.border(65, 66)
712861
win.border(65)
713862
win.border()
863+
# With no arguments, border() fills the edges with ACS line and corner
864+
# characters.
865+
chartext = curses.A_CHARTEXT
866+
maxy, maxx = win.getmaxyx()
867+
self.assertEqual(win.inch(0, 0) & chartext, curses.ACS_ULCORNER & chartext)
868+
self.assertEqual(win.inch(0, maxx-1) & chartext, curses.ACS_URCORNER & chartext)
869+
self.assertEqual(win.inch(maxy-1, 0) & chartext, curses.ACS_LLCORNER & chartext)
870+
self.assertEqual(win.inch(maxy-1, maxx-1) & chartext, curses.ACS_LRCORNER & chartext)
871+
self.assertEqual(win.inch(0, 1) & chartext, curses.ACS_HLINE & chartext)
872+
self.assertEqual(win.inch(1, 0) & chartext, curses.ACS_VLINE & chartext)
714873

715874
win.box(':', '~')
716875
self.assertEqual(win.instr(0, 1, 8), b'~~~~~~~~')
@@ -721,6 +880,11 @@ def test_borders_and_lines(self):
721880
self.assertRaises(TypeError, win.box, 65, 66, 67)
722881
self.assertRaises(TypeError, win.box, 65)
723882
win.box()
883+
# With no arguments, box() likewise draws ACS corners and lines.
884+
self.assertEqual(win.inch(0, 0) & chartext, curses.ACS_ULCORNER & chartext)
885+
self.assertEqual(win.inch(0, maxx-1) & chartext, curses.ACS_URCORNER & chartext)
886+
self.assertEqual(win.inch(0, 1) & chartext, curses.ACS_HLINE & chartext)
887+
self.assertEqual(win.inch(1, 0) & chartext, curses.ACS_VLINE & chartext)
724888

725889
win.move(1, 2)
726890
win.hline('-', 5)
@@ -742,6 +906,43 @@ def test_borders_and_lines(self):
742906
self.assertEqual(win.inch(2, 1), b';'[0] | curses.A_STANDOUT)
743907
self.assertEqual(win.inch(3, 1), b'a'[0])
744908

909+
# A border or line character of an 8-bit locale round-trips as its
910+
# encoded byte. See _encodable for the character set.
911+
encoding = win.encoding
912+
for ch in ('é', '¤', '€', 'є'):
913+
try:
914+
b = ch.encode(encoding)
915+
except UnicodeEncodeError:
916+
continue
917+
if len(b) != 1:
918+
continue
919+
# A wide build stores a character outside Latin-1 as a wide cell,
920+
# not as its encoded byte, so it cannot round-trip here.
921+
if ord(ch) > 0xff and hasattr(win, 'get_wch'):
922+
continue
923+
with self.subTest(ch=ch):
924+
win.erase()
925+
win.hline(2, 0, ch, 5)
926+
self.assertEqual(win.instr(2, 0, 5), b * 5)
927+
win.vline(0, 0, ch, 3)
928+
self.assertEqual(win.instr(0, 0, 1), b)
929+
self.assertEqual(win.instr(1, 0, 1), b)
930+
win.border(ch, ch, ch, ch, ch, ch, ch, ch)
931+
self.assertEqual(win.instr(0, 0), b * maxx)
932+
if ord(ch) < 0x100:
933+
# The same byte given as an int. A wide build stores it
934+
# through the locale, so only a Latin-1 byte round-trips.
935+
v = b[0]
936+
win.erase()
937+
win.hline(2, 0, v, 5)
938+
self.assertEqual(win.instr(2, 0, 5), b * 5)
939+
win.vline(0, 0, v, 3)
940+
self.assertEqual(win.instr(1, 0, 1), b)
941+
win.border(v, v, v, v, v, v, v, v)
942+
self.assertEqual(win.instr(0, 0), b * maxx)
943+
win.box(v, v)
944+
self.assertEqual(win.instr(0, 1, 1), b)
945+
745946
def test_unctrl(self):
746947
# TODO: wunctrl()
747948
self.assertEqual(curses.unctrl(b'A'), b'A')
@@ -750,6 +951,19 @@ def test_unctrl(self):
750951
self.assertEqual(curses.unctrl(b'\n'), b'^J')
751952
self.assertEqual(curses.unctrl('\n'), b'^J')
752953
self.assertEqual(curses.unctrl(10), b'^J')
954+
# A printable non-ASCII byte of an 8-bit locale is returned unchanged.
955+
# See _encodable for the character set.
956+
encoding = self.stdscr.encoding
957+
for ch in ('é', '¤', '€', 'є'):
958+
try:
959+
b = ch.encode(encoding)
960+
except UnicodeEncodeError:
961+
continue
962+
if len(b) != 1:
963+
continue
964+
with self.subTest(ch=ch):
965+
self.assertEqual(curses.unctrl(ch), b)
966+
self.assertEqual(curses.unctrl(b[0]), b) # the byte as an int
753967
self.assertRaises(TypeError, curses.unctrl, b'')
754968
self.assertRaises(TypeError, curses.unctrl, b'AB')
755969
self.assertRaises(TypeError, curses.unctrl, '')
@@ -1449,7 +1663,8 @@ def test_issue6243(self):
14491663
def test_unget_wch(self):
14501664
stdscr = self.stdscr
14511665
encoding = stdscr.encoding
1452-
for ch in ('a', '\xe9', '\u20ac', '\U0010FFFF'):
1666+
# See _encodable for the character set, plus a non-BMP character.
1667+
for ch in ('a', '\xe9', '\xa4', '\u20ac', '\u0454', '\U0010FFFF'):
14531668
try:
14541669
ch.encode(encoding)
14551670
except UnicodeEncodeError:

0 commit comments

Comments
 (0)