Data Manipulation Dictionary for frequently used tasks in R, Python and HiveQL
What to do | Dplyr | Pandas | Hive QL | |
---|---|---|---|---|
Library imports | library(data.table) library(tidyverse) library(dplyr) library(sqldf) |
import pandas as pd import numpy as np |
NA | |
Create data frame | df = data.frame(x=1:10, y = 1:10*2) | df = pd.DataFrame({'x':range(10),'y':np.arange(10)*2}) | drop table test_bounce; create table test_bounce as select 'A' as final_device_id , 'FB' as NCA , '10' as time , 0 as bounce_flag |
|
Read CSV | df = fread('filename.csv') | df = pd.read_csv('filename.csv') | drop table IF EXISTS charin.table ; create table charin.table (x string, y int, z float) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' STORED AS TEXTFILE tblproperties("skip.header.line.count"="1"); LOAD DATA LOCAL INPATH "/mnt/drive2/drake/charin/table.tsv" OVERWRITE INTO TABLE charin.table ; |
|
Save to CSV | write.csv(df,'filename.csv',row.names=FALSE) | df.to_csv('filename.csv',index=False) | hive -e "set hive.cli.print.header=true; select * from charin.table;" > /mnt/outai_dfs/user/charinp/table.tsv | |
Get names | rownames(df) colnames(df) |
df.index df.columns |
desc table; | |
Get dimensions | dim(df) | df.shape | select count(*) from table; desc table; | |
Get structure | str(df) | df.dtypes | desc table; | |
Get summary | summary(df) | df.describe() | desc table; | |
Select | #row df['rowname',] df[1,] #columns df %>% select(col1,col2) df[,c('col1','col2')] df[,c(1,2)] #combination df[1,]$col1 |
#row df.loc['rowname',:] df.iloc[1,:] #columns df.loc[:,['col1','col2']] df.iloc[:,1] #combination df.iloc[1,df.columns.get_loc('col1')] |
select x, y ,z from table; | |
Check NA | is.na(df) | df.isnull() | select case when x is null then 1 else 0 end as null_flag from table; | |
Omit NA | na.omit(df) | df.dropna() | select x from table where x is not null; | |
Impute NA | df[is.na(df)] = 0 | df.fillna(0) | select case when x is null then 0 else x end as new_x from table; | |
View | head(df) or tail(df) | df.head() or df.tail() | select * from table limit 10; | |
Add column | df %>% mutate(z = 1:10) | df['z'] = range(10) | NA | |
Filter | df %>% filter(z > 0) | df[df.z>5] | select * from table where x > 10; | |
Remove column | df %>% select(-z) | df.drop('z', axis=1) | NA | |
Bind row or column | rbind or cbind(df,df) | lst = [df,df] pd.concat(lst,axis=0 or 1) |
(select * from table1) A union/union all (select * from table2) B | |
Remove duplicates | distinct(df) | df.drop_duplicates() | select * from (select x, row_number() over (partition by x order by time asc) as x_rank from table) A where x_rank = 1;
|
Joins | inner_join(DF1, DF2, by = c('colname1', 'colname2')) left_join (DF1, DF2, by = c('colname1', 'colname2')) right_join(DF1, DF2, by = c('colname1', 'colname2')) full_join (DF1, DF2, by = c('colname1', 'colname2')) |
pd.merge(DF1, DF2, how='left', on=['colname1', 'colname2']) pd.merge(DF1, DF2, how='right', on=['colname1', 'colname2']) pd.merge(DF1, DF2, how='inner', on=['colname1', 'colname2']) pd.merge(DF1, DF2, how='outer', on=['colname1', 'colname2']) |
select * from A inner join, left join, cross join B on A.x = B.y; |
|
Sort | df %>% arrange(-x) | df.sort_values(['x','y'],ascending=False) | select * from table order by x desc; | |
Aggregation | df %>% group_by(x,y) %>% summarise(z = min(z)) | df.groupby(['x','y']).agg({'result1' : np.sum, 'result2' : np.mean}) | select x,count(*) from table group by x; | |
Convert to matrix/numpy | as.matrix(df) | df.to_numpy() | NA | |
Melt | melt(cheese, id.vars=c('first', 'last')) | pd.melt(cheese, id_vars=['first', 'last']) | Very Difficult | |
Spread | df %>% spread(Animal, FeedType) | df.pivot_table(values='Amount', index='Animal', columns='FeedType', aggfunc='sum') | select case when x = 'dog' then 1 else 0 end as dog_flag from table; | |
Apply to columns/rows | exp(df$x) or exp(df[1,]) | #dataframe df.apply(np.exp, axis=0 or 1) #series df['x'].map(np.exp) |
select exp(x) from table_name; | |
Apply element-wise | sapply(df$x, FUN=function(x) exp(x)) | df.applymap(lambda x: np.exp(x)) | NA | |
Windowing | df[,rank:=row_number(-x),by='y'] | df['window_rank'] = df.groupby(by=['C1'])['C2'].transform(lambda x: x.rank()) df['window_rank'] = df.sort_values('col1', ascending=True).groupby(['key_col']).cumcount() + 1 |
select lag(NCA) over (partition by final_device_id order by time asc) as lag_NCA from table; |
|
Reshape | matrix(as.matrix(df),nrow=dim(df)[1],ncol=1) | df.to_numpy().reshape(-1,1) | NA | |
SQL-like | sqldf('select * from df where x < 10') | df.query('x<10') | Already SQL |