@@ -1001,8 +1001,9 @@ def cell_activated(self, row_idx, column_idx):
10011001@adapter_for ('pathlib.Path' )
10021002def get_path_suffix_adapter (fpath ):
10031003 logger .debug (f"get_path_suffix_adapter('{ fpath } ')" )
1004- if fpath .suffix .lower () in PATH_SUFFIX_ADAPTERS :
1005- path_adapter_cls , required_module = PATH_SUFFIX_ADAPTERS [fpath .suffix ]
1004+ suffix = fpath .suffix .lower ()
1005+ if suffix in PATH_SUFFIX_ADAPTERS :
1006+ path_adapter_cls , required_module = PATH_SUFFIX_ADAPTERS [suffix ]
10061007 if required_module is not None :
10071008 if required_module not in sys .modules :
10081009 import importlib
@@ -1013,7 +1014,15 @@ def get_path_suffix_adapter(fpath):
10131014 f"which is required to handle { fpath .suffix } "
10141015 f"files" )
10151016 return None
1016- return path_adapter_cls
1017+ # 2 options:
1018+ # - either there is a single adapter for that suffix
1019+ if (isinstance (path_adapter_cls , type ) and
1020+ issubclass (path_adapter_cls , AbstractAdapter )):
1021+ return path_adapter_cls
1022+ # - different adapters handle that suffix and/or not all instances can
1023+ # be handled
1024+ else :
1025+ return path_adapter_cls (fpath )
10171026 elif fpath .is_dir ():
10181027 return DirectoryPathAdapter
10191028 else :
@@ -2585,9 +2594,9 @@ def open(cls, fpath):
25852594
25862595
25872596# TODO: options to display as hex or decimal
2588- # >>> s = f.read(20 )
2597+ # >>> s = f.read(10 )
25892598# >>> s
2590- # b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ xc2\xea\x81\xb3\x14\x11\xcf\xbd
2599+ # b'\x00\x00\xc2\xea\x81\xb3\x14\x11\xcf\xbd
25912600@adapter_for ('_io.BufferedReader' )
25922601class BinaryFileAdapter (AbstractAdapter ):
25932602 def __init__ (self , data , attributes ):
@@ -2771,8 +2780,12 @@ def _detect_encoding(self, chunk):
27712780 try :
27722781 import charset_normalizer
27732782 chartset_match = charset_normalizer .from_bytes (chunk ).best ()
2774- self ._encoding = chartset_match .encoding
2775- logger .debug (f"encoding detected as { self ._encoding } " )
2783+ if chartset_match is None :
2784+ self ._encoding = None
2785+ logger .debug ("could not detect encoding from chunk" )
2786+ else :
2787+ self ._encoding = chartset_match .encoding
2788+ logger .debug (f"encoding detected as { self ._encoding } " )
27762789 except ImportError :
27772790 logger .debug ("could not import 'charset_normalizer' => cannot detect encoding" )
27782791
@@ -3159,6 +3172,31 @@ def open(cls, fpath):
31593172 return duckdb .connect (fpath )
31603173
31613174
class CSVGZPathAdapater(CsvFileAdapter):
    """CsvFileAdapter variant whose underlying file is read through gzip."""

    @classmethod
    def open(cls, fpath):
        """Open the gzip-compressed file at *fpath* as UTF-8 text."""
        from gzip import open as open_gzip

        # Text mode with an explicit encoding: a binary handle would yield
        # bytes instead of str, which the csv reader cannot consume.
        text_stream = open_gzip(fpath, mode='rt', encoding='utf-8')
        return text_stream

    @property
    def _binary_file(self):
        """Return a binary-mode gzip handle on ``self.data.name``.

        A new handle is opened every time the property is read.
        """
        from gzip import open as open_gzip

        source_name = self.data.name
        return open_gzip(source_name, mode='rb')
3187+
3188+
@path_adapter_for('.gz', 'gzip')
def dispatch_gzip_path_adapter(gz_path):
    """Choose the path adapter for a ``.gz`` file from its inner extension.

    The final suffix of *gz_path* is dropped and the suffix of the remaining
    name is compared case-insensitively. Returns ``CSVGZPathAdapater`` when
    that inner suffix is ``.csv`` and ``None`` for anything else.
    """
    # e.g. "data.CSV.gz" -> "data.CSV" -> ".csv"
    inner_suffix = gz_path.with_name(gz_path.stem).suffix.lower()
    return CSVGZPathAdapater if inner_suffix == '.csv' else None
3198+
3199+
31623200@adapter_for ('zipfile.ZipFile' )
31633201class ZipFileAdapter (AbstractColumnarAdapter ):
31643202 def __init__ (self , data , attributes ):
0 commit comments