-
Notifications
You must be signed in to change notification settings - Fork 0
/
textwrapre.py
85 lines (71 loc) · 3.19 KB
/
textwrapre.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
from typing import Union, List
import regex
def _next_cut(pattern, chunk, limit):
    """Return the index at which `chunk` should be cut, or None to keep it whole."""
    best = None
    for match in pattern.finditer(chunk, concurrent=True, partial=False):
        end = match.end()
        if end <= 0:
            # A zero-width match at the very start would cut off an empty
            # block and make no progress.
            continue
        if end < limit:
            # Strict comparison kept on purpose: it is what the function has
            # always used, so existing inputs keep producing the same blocks.
            best = end
            continue
        # First separator at or past the limit. Cut at the last separator that
        # still fits; if there is none, cut here so the oversize block at least
        # ends on a separator instead of at an arbitrary position.
        return end if best is None else best
    # No separator at or past the limit: only cut when the text does not fit
    # as a whole and a fitting separator exists.
    if best is not None and len(chunk) > limit:
        return best
    return None


def wrapre(
    text: Union[str, bytes],
    blocksize: int,
    regexsep: Union[str, bytes] = r"[\r\n]",
    raisewhenlonger: bool = True,
    removenewlines_from_result: bool = False,
    *args,
    **kwargs
) -> List[Union[str, bytes]]:
    r"""
    Split `text` into consecutive blocks, cutting only right after a match of `regexsep`.

    Each block is extended up to the last separator match that ends before
    `blocksize`; the separator stays attached to the end of the block that
    precedes it, so concatenating the returned blocks gives back `text`
    (unless `removenewlines_from_result` is used). When no separator ends
    before `blocksize`, the block is cut after the first separator that
    follows, or runs to the end of the text if there is none; such a block is
    longer than `blocksize`.

    Args:
        text (str/bytes): The text to be split into blocks.
        blocksize (int): The maximum size of each block.
        regexsep (str/bytes): The regular expression pattern after whose matches
            the text may be cut. Defaults to r"[\r\n]". An ASCII-only str pattern
            is converted to bytes when `text` is bytes, so the default also
            works for bytes input.
        raisewhenlonger (bool, optional): Whether to raise an error if any block
            is larger than `blocksize`. Defaults to True.
        removenewlines_from_result (bool, optional): Whether to replace each run
            of newline characters (\r, \n) in the resulting blocks with a single
            space. Defaults to False.
        *args: Additional arguments to be passed to the `regex.compile` function.
        **kwargs: Additional keyword arguments to be passed to the `regex.compile` function.

    Returns:
        list: A list of strings (or bytes, if `text` was a bytes object). For
        empty `text` the list holds `text` itself as its only element.

    Raises:
        ValueError: If `raisewhenlonger` is True and any block is larger than `blocksize`.
    """
    isbytes = isinstance(text, bytes)
    if isbytes and isinstance(regexsep, str) and regexsep.isascii():
        # A str pattern cannot be applied to bytes; without this the default
        # separator would fail for bytes input.
        regexsep = regexsep.encode("ascii")
    pattern = regex.compile(regexsep, *args, **kwargs)

    blocks = []
    remaining = text
    while remaining:
        cut = _next_cut(pattern, remaining, blocksize)
        if cut is None:
            blocks.append(remaining)
            break
        blocks.append(remaining[:cut])
        remaining = remaining[cut:]
    if not blocks:
        # Empty input still yields one (empty) block.
        blocks.append(text)

    if raisewhenlonger and any(len(block) > blocksize for block in blocks):
        raise ValueError(
            "Some blocks are bigger than the limit! Try again with another separator or a bigger limit!"
        )

    if removenewlines_from_result:
        if isbytes:
            newline_run = regex.compile(rb"[\r\n]+")
            blank = b" "
        else:
            newline_run = regex.compile(r"[\r\n]+")
            blank = " "
        blocks = [newline_run.sub(blank, block) for block in blocks]
    return blocks