diff --git a/_modules/infer.html b/_modules/infer.html index 6a9a573..a89b945 100644 --- a/_modules/infer.html +++ b/_modules/infer.html @@ -295,18 +295,58 @@
return dtype_guess
-def type_check_date(element: object) -> str:
+[docs]def type_check_date(element: object) -> str:
+ """
+ Check if element corresponds to a date-like object.
+ """
+ # check if element represents a date (no hour/minute/seconds)
+ is_date = False
+ # check if element represents a datetime (has hour/minute/seconds)
+ is_datetime = False
+ # check if it makes sense to convert element to a unix timestamp by
+ # evaluating if, when converted, the element represents a number that
+ # is compatible with a Unix timestamp (number of seconds since 1970-01-01T00:00:00).
+ # Note that we also check the number is not larger than the "epochalypse time",
+ # which is when the unix timestamp becomes larger than 2^31 - 1 seconds. We do
+ # this because timestamps outside this range are likely to be unreliable and are
+ # hence better treated as ordinary numbers.
+ min_dt = pd.to_datetime('1970-01-01 00:00:00', utc=True)
+ max_dt = pd.to_datetime('2038-01-19 03:14:08', utc=True)
+ valid_units = {'ns': 'unix', 'us': 'unix', 'ms': 'unix', 's': 'unix',
+ 'D': 'julian'}
+ for unit, origin in valid_units.items():
+ try:
+ as_dt = pd.to_datetime(element, unit=unit, origin=origin,
+ errors='raise')
+ if min_dt < as_dt < max_dt:
+ is_datetime = True
+ break
+ except Exception:
+ pass
+ # check if element represents a date-like object.
+ # here we don't check for a validity range like with unix timestamps
+ # because dates given as strings usually represent something more general
+ # than just the number of seconds since an epoch.
try:
- dt = pd.to_datetime(element)
-
- # Not accurate 100% for a single datetime str, but should work in aggregate
- if dt.hour == 0 and dt.minute == 0 and dt.second == 0 and len(str(element)) <= 16:
- return dtype.date
- else:
- return dtype.datetime
-
- except ValueError:
- return None
+ as_dt = pd.to_datetime(element, errors='raise')
+ is_datetime = True
+ except Exception:
+ pass
+ # finally, if element represents a datetime object, check if only the
+ # date part is present (no time information)
+ if is_datetime:
+ # round element day (drop hour/minute/second)
+ dt_d = as_dt.to_period('D').to_timestamp()
+ # if the rounded datetime equals the datetime itself, it means there was no
+ # hour/minute/second information to begin with. Mind the 'tz_localize(None)',
+ # which removes time-zone information before the comparison.
+ is_date = dt_d == as_dt.tz_localize(None)
+ if is_date:
+ return dtype.date
+ if is_datetime:
+ return dtype.datetime
+
+ return None
def count_data_types_in_column(data):
@@ -559,7 +599,7 @@ Source code for infer
population_size = len(data)
log.info(f'Analyzing a sample of {sample_size}')
log.info(
- f'from a total population of {population_size}, this is equivalent to {round(sample_size*100/population_size, 1)}% of your data.') # noqa
+ f'from a total population of {population_size}, this is equivalent to {round(sample_size*100/population_size, 1)}% of your data.') # noqa
nr_procs = get_nr_procs(df=sample_df)
pool_size = min(nr_procs, len(sample_df.columns.values))
diff --git a/_static/pygments.css b/_static/pygments.css
index 691aeb8..0d49244 100644
--- a/_static/pygments.css
+++ b/_static/pygments.css
@@ -17,6 +17,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
.highlight .cs { color: #408090; background-color: #fff0f0 } /* Comment.Special */
.highlight .gd { color: #A00000 } /* Generic.Deleted */
.highlight .ge { font-style: italic } /* Generic.Emph */
+.highlight .ges { font-weight: bold; font-style: italic } /* Generic.EmphStrong */
.highlight .gr { color: #FF0000 } /* Generic.Error */
.highlight .gh { color: #000080; font-weight: bold } /* Generic.Heading */
.highlight .gi { color: #00A000 } /* Generic.Inserted */
diff --git a/genindex.html b/genindex.html
index 20b121a..df3e9b0 100644
--- a/genindex.html
+++ b/genindex.html
@@ -291,6 +291,8 @@ T
diff --git a/index.html b/index.html
index 1523066..386f4f9 100644
--- a/index.html
+++ b/index.html
@@ -175,7 +175,7 @@ Type Infer0.0.15
Date
-Aug 01, 2023
+Aug 21, 2023
@@ -298,6 +298,7 @@ Other Linksget_column_data_type
get_numeric_type
infer_types
+type_check_date
Helpers
diff --git a/infer.html b/infer.html
index 36c682b..7ce84fa 100644
--- a/infer.html
+++ b/infer.html
@@ -105,6 +105,7 @@
- get_column_data_type
- get_numeric_type
- infer_types
+- type_check_date
Helpers
@@ -260,6 +261,17 @@
+
+
diff --git a/objects.inv b/objects.inv
index c65b4d3..c976455 100644
Binary files a/objects.inv and b/objects.inv differ
diff --git a/searchindex.js b/searchindex.js
index 17ab8a5..77a51c2 100644
--- a/searchindex.js
+++ b/searchindex.js
@@ -1 +1 @@
-Search.setIndex({docnames:["base","dtype","helpers","index","infer"],envversion:{"sphinx.domains.c":2,"sphinx.domains.changeset":1,"sphinx.domains.citation":1,"sphinx.domains.cpp":4,"sphinx.domains.index":1,"sphinx.domains.javascript":2,"sphinx.domains.math":2,"sphinx.domains.python":3,"sphinx.domains.rst":2,"sphinx.domains.std":2,"sphinx.ext.viewcode":1,nbsphinx:4,sphinx:56},filenames:["base.rst","dtype.rst","helpers.rst","index.rst","infer.rst"],objects:{"":{base:[0,0,0,"-"],dtype:[1,0,0,"-"],helpers:[2,0,0,"-"],infer:[4,0,0,"-"]},base:{TypeInformation:[0,1,1,""]},dtype:{dtype:[1,1,1,""]},helpers:{cast_string_to_python_type:[2,2,1,""],is_nan_numeric:[2,2,1,""],tokenize_text:[2,2,1,""]},infer:{calculate_sample_size:[4,2,1,""],get_column_data_type:[4,2,1,""],get_numeric_type:[4,2,1,""],infer_types:[4,2,1,""]}},objnames:{"0":["py","module","Python module"],"1":["py","class","Python class"],"2":["py","function","Python function"]},objtypes:{"0":"py:module","1":"py:class","2":"py:function"},terms:{"0":[3,4],"01":[3,4],"1":4,"100":[1,4],"10000":4,"15":3,"1e4":4,"2":4,"2023":3,"420":4,"5":4,"995":4,"abstract":0,"case":[3,4],"class":[0,1],"default":[3,4],"do":[0,1,3],"export":3,"float":[1,2,4],"import":3,"int":4,"long":1,"new":3,"return":[2,4],"short":1,"switch":4,"true":4,Be:3,By:3,For:[0,4],If:[3,4],In:3,The:[0,1,2,4],These:0,To:3,abid:3,abov:3,accept:4,achiev:4,actual:2,ad:3,add:3,addit:[0,2,3],additional_info:0,address:3,affect:3,agre:3,agreement:3,ai:0,aim:3,all:[1,2,3],allow:3,altern:1,an:[1,2,3,4],analysi:0,analyz:4,ani:0,announc:3,anyth:1,ar:[0,1,3,4],arrai:1,ask:3,associ:0,audio:1,aug:3,autom:3,automat:3,avail:1,base:3,bashrc:3,basi:3,becaus:1,becom:3,befor:[3,4],being:0,belong:4,benchmark:3,between:4,binari:1,bool:2,branch:3,calcul:4,calculate_sample_s:[3,4],can:[1,2,4],cast:2,cast_string_to_python_typ:[2,3],categor:1,categori:0,cd:3,cell:4,certain:4,chang:3,charact:1,chat:3,check:3,checkout:3,ci:3,cli:3,clone:3,codebas:2,col_nam:4,column:[0,1,3,4],commit:3,com
plex:1,comprehens:2,confid:4,confidence_level:4,config:0,confirm:3,consid:1,contain:[0,4],continu:3,core:3,correctli:1,count:4,creat:3,current:1,custom:[0,1],data:[0,3,4],data_typ:4,datafram:[3,4],dataset:[0,3,4],date:[1,3],datetim:1,deal:1,defin:[1,4],definit:1,democrat:3,descript:0,descriptor:1,determin:[2,4],develop:3,deviat:4,differ:4,directori:3,discov:3,discret:1,discuss:3,distribut:4,document:3,doe:3,doesn:3,done:3,draw:4,drawn:4,dtype:[0,1,3],e:[2,3,4],each:[0,3,4],easi:3,edit:[1,3],element:4,empti:1,end:3,env:3,equal:4,error:4,estim:4,evalu:0,event:3,except:3,exist:1,expect:4,experiment:1,explicitli:1,fail:4,fals:4,feedback:3,file:[0,3],fill:3,first:3,fix:3,flag:1,flake8:3,follow:3,fork:3,form:1,frame:4,framework:0,from:[2,3,4],full_data:4,g:3,gener:[0,1,2,3,4],get:3,get_column_data_typ:[3,4],get_numeric_typ:[3,4],git:3,github:3,go:3,gpl:3,greater:1,guarante:3,ha:[1,2,4],have:3,hear:3,helper:3,here:[1,3],highli:[0,1],how:[0,1,4],i:[2,4],id:0,identifi:0,ignor:0,imag:1,implement:[1,3],improv:3,includ:1,indic:[0,1],inf:2,infer:[0,1],infer_typ:[0,3,4],inform:[0,1,4],informat:0,input:[3,4],insid:3,instead:[2,3],integ:[1,2,4],interest:3,interv:4,invalid:[1,4],is_nan_numer:[2,3],issu:3,item:4,iter:4,its:[3,4],join:3,json:0,kind:0,know:3,label:1,languag:1,larg:4,later:4,latest:3,learn:3,let:3,level:[1,4],lie:4,limit:1,list:[2,4],local:3,logic:4,love:3,m:3,machin:3,mai:[0,1],main:[0,4],make:3,mani:4,manual:0,margin:4,margin_error:4,maximum:4,mean:4,memori:2,merg:3,method:2,might:2,mindsdb:3,minim:4,miscellan:1,mission:3,ml:4,model:0,modul:[0,1,2,4],monthli:3,most:3,mp_cutoff:4,multipl:3,must:1,name:[0,1],nan:2,ndarrai:4,need:3,newli:3,newlin:3,newslett:3,none:2,normal:1,note:[3,4],number:[1,2,4],numer:[1,2],open:3,opinion:3,optim:2,option:4,order:1,other:[0,2],our:3,out:3,overrid:[0,1],own:[1,3],packag:[0,3],panda:3,parallel:4,paramet:[0,4],part:1,particip:3,pass:3,path:3,pct_invalid:4,pd:4,percentag:4,perform:3,pip3:3,pip:3,pipelin:1,pleas:[1,3],popul:4,population_
s:4,possibl:[0,1],potenti:0,pr:3,preprocess:1,preserv:1,previou:3,privat:3,procedur:0,process:[1,4],project:3,propos:3,provid:[0,4],pull:3,push:3,python2:3,python3:3,python:3,pythonpath:3,qualiti:4,quantiti:1,question:3,random:4,re:0,read:3,readi:4,receiv:3,recommend:[0,3],regular:3,releas:3,remot:3,repo:3,repositori:3,repres:1,request:3,requir:[1,3],respond:3,rich:1,rich_text:1,row:1,run:3,runtim:2,s:[0,3,4],sampl:[0,4],sample_paramet:4,scientist:3,seed:4,seed_nr:4,sequenc:1,sequenti:1,seri:1,short_text:1,should:[1,3,4],sigma:4,sign:3,sinc:3,size:4,small:[0,1,4],so:[0,1,3],solv:3,some:2,sourc:[0,1,2,4],stabl:3,stage:3,standard:4,step:3,str:4,string:[2,4],style:3,sub:0,submit:3,subsequ:[0,1],subtyp:4,suffici:4,suit:3,support:1,sure:3,suspect:0,t:3,tabular:3,tag:1,team:3,techniqu:1,tempor:1,term:3,test:3,text:[1,2],thei:[0,1],them:3,therefor:0,thi:[3,4],throughout:0,time:1,tokenize_text:[2,3],total:4,track:3,train:0,transform:1,treat:1,treatment:1,tsarrai:1,type:[0,2,4],type_distribut:4,type_inf:3,typeinform:[0,3,4],under:3,understand:0,union:4,unit:3,unittest:3,unknown:1,unless:0,updat:3,us:[0,2,4],usag:2,user:0,valid:3,valu:[0,1,2,4],version:3,versu:1,video:1,virtual:3,vocabulari:1,want:[1,3,4],we:[3,4],welcom:[1,3],went:3,were:4,what:0,where:[0,1,3,4],which:[2,3,4],whole:4,within:[0,4],word:1,work:3,workflow:3,x:3,you:1,your:[1,3]},titles:["Base
","Data types
","Helpers
","Type Infer","Infer
"],titleterms:{base:0,bug:3,can:3,code:3,commun:3,conduct:3,contribut:3,contributor:3,data:1,dev:3,environ:3,featur:3,guid:3,help:3,helper:2,how:3,infer:[3,4],instal:3,licens:3,link:3,other:3,process:3,quick:3,report:3,review:3,set:3,start:3,type:[1,3],up:3,us:3,you:3}})
\ No newline at end of file
+Search.setIndex({docnames:["base","dtype","helpers","index","infer"],envversion:{"sphinx.domains.c":2,"sphinx.domains.changeset":1,"sphinx.domains.citation":1,"sphinx.domains.cpp":4,"sphinx.domains.index":1,"sphinx.domains.javascript":2,"sphinx.domains.math":2,"sphinx.domains.python":3,"sphinx.domains.rst":2,"sphinx.domains.std":2,"sphinx.ext.viewcode":1,nbsphinx:4,sphinx:56},filenames:["base.rst","dtype.rst","helpers.rst","index.rst","infer.rst"],objects:{"":{base:[0,0,0,"-"],dtype:[1,0,0,"-"],helpers:[2,0,0,"-"],infer:[4,0,0,"-"]},base:{TypeInformation:[0,1,1,""]},dtype:{dtype:[1,1,1,""]},helpers:{cast_string_to_python_type:[2,2,1,""],is_nan_numeric:[2,2,1,""],tokenize_text:[2,2,1,""]},infer:{calculate_sample_size:[4,2,1,""],get_column_data_type:[4,2,1,""],get_numeric_type:[4,2,1,""],infer_types:[4,2,1,""],type_check_date:[4,2,1,""]}},objnames:{"0":["py","module","Python module"],"1":["py","class","Python class"],"2":["py","function","Python function"]},objtypes:{"0":"py:module","1":"py:class","2":"py:function"},terms:{"0":[3,4],"01":4,"1":4,"100":[1,4],"10000":4,"15":3,"1e4":4,"2":4,"2023":3,"21":3,"420":4,"5":4,"995":4,"abstract":0,"case":[3,4],"class":[0,1],"default":[3,4],"do":[0,1,3],"export":3,"float":[1,2,4],"import":3,"int":4,"long":1,"new":3,"return":[2,4],"short":1,"switch":4,"true":4,Be:3,By:3,For:[0,4],If:[3,4],In:3,The:[0,1,2,4],These:0,To:3,abid:3,abov:3,accept:4,achiev:4,actual:2,ad:3,add:3,addit:[0,2,3],additional_info:0,address:3,affect:3,agre:3,agreement:3,ai:0,aim:3,all:[1,2,3],allow:3,altern:1,an:[1,2,3,4],analysi:0,analyz:4,ani:0,announc:3,anyth:1,ar:[0,1,3,4],arrai:1,ask:3,associ:0,audio:1,aug:3,autom:3,automat:3,avail:1,base:3,bashrc:3,basi:3,becaus:1,becom:3,befor:[3,4],being:0,belong:4,benchmark:3,between:4,binari:1,bool:2,branch:3,calcul:4,calculate_sample_s:[3,4],can:[1,2,4],cast:2,cast_string_to_python_typ:[2,3],categor:1,categori:0,cd:3,cell:4,certain:4,chang:3,charact:1,chat:3,check:[3,4],checkout:3,ci:3,cli:3,clone:3,codebas:2,col_n
am:4,column:[0,1,3,4],commit:3,complex:1,comprehens:2,confid:4,confidence_level:4,config:0,confirm:3,consid:1,contain:[0,4],continu:3,core:3,correctli:1,correspond:4,count:4,creat:3,current:1,custom:[0,1],data:[0,3,4],data_typ:4,datafram:[3,4],dataset:[0,3,4],date:[1,3,4],datetim:1,deal:1,defin:[1,4],definit:1,democrat:3,descript:0,descriptor:1,determin:[2,4],develop:3,deviat:4,differ:4,directori:3,discov:3,discret:1,discuss:3,distribut:4,document:3,doe:3,doesn:3,done:3,draw:4,drawn:4,dtype:[0,1,3],e:[2,3,4],each:[0,3,4],easi:3,edit:[1,3],element:4,empti:1,end:3,env:3,equal:4,error:4,estim:4,evalu:0,event:3,except:3,exist:1,expect:4,experiment:1,explicitli:1,fail:4,fals:4,feedback:3,file:[0,3],fill:3,first:3,fix:3,flag:1,flake8:3,follow:3,fork:3,form:1,frame:4,framework:0,from:[2,3,4],full_data:4,g:3,gener:[0,1,2,3,4],get:3,get_column_data_typ:[3,4],get_numeric_typ:[3,4],git:3,github:3,go:3,gpl:3,greater:1,guarante:3,ha:[1,2,4],have:3,hear:3,helper:3,here:[1,3],highli:[0,1],how:[0,1,4],i:[2,4],id:0,identifi:0,ignor:0,imag:1,implement:[1,3],improv:3,includ:1,indic:[0,1],inf:2,infer:[0,1],infer_typ:[0,3,4],inform:[0,1,4],informat:0,input:[3,4],insid:3,instead:[2,3],integ:[1,2,4],interest:3,interv:4,invalid:[1,4],is_nan_numer:[2,3],issu:3,item:4,iter:4,its:[3,4],join:3,json:0,kind:0,know:3,label:1,languag:1,larg:4,later:4,latest:3,learn:3,let:3,level:[1,4],lie:4,like:4,limit:1,list:[2,4],local:3,logic:4,love:3,m:3,machin:3,mai:[0,1],main:[0,4],make:3,mani:4,manual:0,margin:4,margin_error:4,maximum:4,mean:4,memori:2,merg:3,method:2,might:2,mindsdb:3,minim:4,miscellan:1,mission:3,ml:4,model:0,modul:[0,1,2,4],monthli:3,most:3,mp_cutoff:4,multipl:3,must:1,name:[0,1],nan:2,ndarrai:4,need:3,newli:3,newlin:3,newslett:3,none:2,normal:1,note:[3,4],number:[1,2,4],numer:[1,2],object:4,open:3,opinion:3,optim:2,option:4,order:1,other:[0,2],our:3,out:3,overrid:[0,1],own:[1,3],packag:[0,3],panda:3,parallel:4,paramet:[0,4],part:1,particip:3,pass:3,path:3,pct_invalid:4,pd:4,percentag:4
,perform:3,pip3:3,pip:3,pipelin:1,pleas:[1,3],popul:4,population_s:4,possibl:[0,1],potenti:0,pr:3,preprocess:1,preserv:1,previou:3,privat:3,procedur:0,process:[1,4],project:3,propos:3,provid:[0,4],pull:3,push:3,python2:3,python3:3,python:3,pythonpath:3,qualiti:4,quantiti:1,question:3,random:4,re:0,read:3,readi:4,receiv:3,recommend:[0,3],regular:3,releas:3,remot:3,repo:3,repositori:3,repres:1,request:3,requir:[1,3],respond:3,rich:1,rich_text:1,row:1,run:3,runtim:2,s:[0,3,4],sampl:[0,4],sample_paramet:4,scientist:3,seed:4,seed_nr:4,sequenc:1,sequenti:1,seri:1,short_text:1,should:[1,3,4],sigma:4,sign:3,sinc:3,size:4,small:[0,1,4],so:[0,1,3],solv:3,some:2,sourc:[0,1,2,4],stabl:3,stage:3,standard:4,step:3,str:4,string:[2,4],style:3,sub:0,submit:3,subsequ:[0,1],subtyp:4,suffici:4,suit:3,support:1,sure:3,suspect:0,t:3,tabular:3,tag:1,team:3,techniqu:1,tempor:1,term:3,test:3,text:[1,2],thei:[0,1],them:3,therefor:0,thi:[3,4],throughout:0,time:1,tokenize_text:[2,3],total:4,track:3,train:0,transform:1,treat:1,treatment:1,tsarrai:1,type:[0,2,4],type_check_d:[3,4],type_distribut:4,type_inf:3,typeinform:[0,3,4],under:3,understand:0,union:4,unit:3,unittest:3,unknown:1,unless:0,updat:3,us:[0,2,4],usag:2,user:0,valid:3,valu:[0,1,2,4],version:3,versu:1,video:1,virtual:3,vocabulari:1,want:[1,3,4],we:[3,4],welcom:[1,3],went:3,were:4,what:0,where:[0,1,3,4],which:[2,3,4],whole:4,within:[0,4],word:1,work:3,workflow:3,x:3,you:1,your:[1,3]},titles:["Base
","Data types
","Helpers
","Type Infer","Infer
"],titleterms:{base:0,bug:3,can:3,code:3,commun:3,conduct:3,contribut:3,contributor:3,data:1,dev:3,environ:3,featur:3,guid:3,help:3,helper:2,how:3,infer:[3,4],instal:3,licens:3,link:3,other:3,process:3,quick:3,report:3,review:3,set:3,start:3,type:[1,3],up:3,us:3,you:3}})
\ No newline at end of file