Skip to content

Data Manipulation Dictionary for frequently used tasks in R, Python and HiveQL

License

Notifications You must be signed in to change notification settings

Datatouille/data_manipulation_dictionary

Folders and files

NameName
Last commit message
Last commit date

Latest commit

 

History

2 Commits
 
 
 
 

Repository files navigation

data_manipulation_dictionary

Data Manipulation Dictionary for frequently used tasks in R, Python and HiveQL

What to do Dplyr Pandas Hive QL
Library imports library(data.table)
library(tidyverse)
library(dplyr)
library(sqldf)
import pandas as pd
import numpy as np
NA
Create data frame df = data.frame(x=1:10, y = 1:10*2) df = pd.DataFrame({'x':range(10),'y':np.arange(10)*2}) drop table test_bounce;
create table test_bounce as
select
'A' as final_device_id ,
'FB' as NCA ,
'10' as time ,
0 as bounce_flag
Read CSV df = fread('filename.csv') df = pd.read_csv('filename.csv') drop table IF EXISTS charin.table ;
create table charin.table
(x string,
y int,
z float)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE
tblproperties("skip.header.line.count"="1");

LOAD DATA LOCAL INPATH "/mnt/drive2/drake/charin/table.tsv" OVERWRITE INTO TABLE
charin.table ;
Save to CSV write.csv(df,'filename.csv',row.names=FALSE) df.to_csv('filename.csv',index=False) hive -e': hive -e "set hive.cli.print.header=true; select * from charin.table;" > /mnt/outai_dfs/user/charinp/table.tsv
Get names rownames(df)
colnames(df)
df.index
df.columns
desc table;
Get dimensions dim(df) df.shape select count(*) from table; desc table;
Get structure str(df) df.dtypes desc table;
Get summary summary(df) df.describe() desc table;
Select #row
df['rowname',]
df[1,]
#columns
df %>% select(col1,col2)
df[,c('col1','col2')]
df[,c(1,2)]
#combination
df[1,]$col1
#row
df.loc['rowname',:]
df.iloc[1,:]
#columns
df.loc[:,['col1','col2']]
df.iloc[:,1]
#combination
df.iloc[1,df.columns.get_loc('col1')]
select x, y ,z from table;
Check NA is.na(df) df.isnull() select case when x is null then 1 else 0 end as null_flag from table;
Omit NA na.omit(df) df.dropna() select x from table where x is not null;
Impute NA df[is.na(df)] = 0 df.fillna(0) select case when x is null then 0 else x end as new_x from table;
View head(df) or tail(df) df.head() or df.tail() select * from table limit 10;
Add column df %>% mutate(z = 1:10) df['z'] = range(10) NA
Filter df %>% filter(z > 0) df[df.z>5] select * from table where x > 10;
Remove column df %>% select(-z) df.drop('z', axis=1) NA
Bind row or column rbind or cbind(df,df) lst = [df,df]
pd.concat(lst,axis=0 or 1)
(select * from table1) A union/union all (select * from table2) B
Remove duplicates distinct(df) df.drop_duplicates() select * from
(select
x,
row_number() over (partition by x order by time asc) as x_rank
from table) A
where x_rank = 1;
Joins inner_join(DF1, DF2, by = c('colname1', 'colname2'))
left_join (DF1, DF2, by = c('colname1', 'colname2'))
right_join(DF1, DF2, by = c('colname1', 'colname2'))
full_join (DF1, DF2, by = c('colname1', 'colname2'))
pd.merge(DF1, DF2, how='left', on=['colname1', 'colname2'])
pd.merge(DF1, DF2, how='right', on=['colname1', 'colname2'])
pd.merge(DF1, DF2, how='inner', on=['colname1', 'colname2'])
pd.merge(DF1, DF2, how='outer', on=['colname1', 'colname2'])
select * from A
inner join, left join, cross join B
on A.x = B.y;
Sort df %>% arrange(-x) df.sort_values(['x','y'],ascending=False) select * from table order by x desc;
Aggregation df %>% group_by(x,y) %>% summarise(z = min(z)) df.groupby(['x','y']).agg({'result1' : np.sum, 'result2' : np.mean}) select x,count(*) from table group by x;
Convert to matrix/numpy as.matrix(df) df.to_numpy() NA
Melt melt(cheese, id.vars=c('first', 'last')) pd.melt(cheese, id_vars=['first', 'last']) Very Difficult
Spread df %>% spread(Animal, FeedType) df.pivot_table(values='Amount', index='Animal', columns='FeedType', aggfunc='sum') select case when x = 'dog' then 1 else 0 end as dog_flag from table;
Apply to columns/rows exp(df$x) or exp(df[1,]) #dataframe
df.apply(np.exp, axis=0 or 1)
#series
df['x'].map(np.exp)
select exp(x) from table_name;
Apply element-wise sapply(df$x, FUN=function(x) exp(x)) df.applymap(lambda x: np.exp(x)-1) NA
Windowing df[,rank:=row_number(-x),by='y'] df['window_rank'] = df.groupby(by=['C1'])['C2'].transform(lambda x: x.rank())
df['window_rank'] = df.sort_values('col1', ascending=True).groupby(['key_col']).cumcount() + 1
select lag(NCA) over (partition by final_device_id order by time asc) as lag_NCA from table;
Reshape matrix(as.matrix(df),nrow=dim(df)[1],ncol=1) df.to_numpy().reshape(-1,1) NA
SQL-like sqldf('select * from df where x < 10') df.query('x<10') Already SQL

About

Data Manipulation Dictionary for frequently used tasks in R, Python and HiveQL

Resources

License

Stars

Watchers

Forks

Releases

No releases published

Packages

No packages published