Data Manipulation Dictionary for frequently used tasks in R, Python and HiveQL
What to do | Dplyr | Pandas | Hive QL | |
---|---|---|---|---|
Library imports | library(data.table) library(tidyverse) library(dplyr) library(sqldf) |
import pandas as pd import numpy as np |
NA | |
Create data frame | df = data.frame(x=1:10, y = 1:10*2) | df = pd.DataFrame({'x':range(10),'y':np.arange(10)*2}) | drop table test_bounce; create table test_bounce as select 'A' as final_device_id , 'FB' as NCA , '10' as time , 0 as bounce_flag |
|
Read CSV | df = fread('filename.csv') | df = pd.read_csv('filename.csv') | drop table IF EXISTS charin.table ; create table charin.table (x string, y int, z float) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' STORED AS TEXTFILE tblproperties("skip.header.line.count"="1"); LOAD DATA LOCAL INPATH "/mnt/drive2/drake/charin/table.tsv" OVERWRITE INTO TABLE charin.table ; |
|
Save to CSV | write.csv(df,'filename.csv',row.names=FALSE) | df.to_csv('filename.csv',index=False) | hive -e "set hive.cli.print.header=true; select * from charin.table;" > /mnt/outai_dfs/user/charinp/table.tsv | |
Get names | rownames(df) colnames(df) |
df.index df.columns |
desc table; | |
Get dimensions | dim(df) | df.shape | select count(*) from table; desc table; | |
Get structure | str(df) | df.dtypes | desc table; | |
Get summary | summary(df) | df.describe() | desc table; | |
Select | #row df['rowname',] df[1,] #columns df %>% select(col1,col2) df[,c('col1','col2')] df[,c(1,2)] #combination df[1,]$col1 |
#row df.loc['rowname',:] df.iloc[1,:] #columns df.loc[:,['col1','col2']] df.iloc[:,1] #combination df.iloc[1,df.columns.get_loc('col1')] |
select x, y ,z from table; | |
Check NA | is.na(df) | df.isnull() | select case when x is null then 1 else 0 end as null_flag from table; | |
Omit NA | na.omit(df) | df.dropna() | select x from table where x is not null; | |
Impute NA | df[is.na(df)] = 0 | df.fillna(0) | select case when x is null then 0 else x end as new_x from table; | |
View | head(df) or tail(df) | df.head() or df.tail() | select * from table limit 10; | |
Add column | df %>% mutate(z = 1:10) | df['z'] = range(10) | NA | |
Filter | df %>% filter(z > 0) | df[df.z>5] | select * from table where x > 10; | |
Remove column | df %>% select(-z) | df.drop('z', axis=1) | NA | |
Bind row or column | rbind or cbind(df,df) | lst = [df,df] pd.concat(lst,axis=0 or 1) |
(select * from table1) A union/union all (select * from table2) B | |
Remove duplicates | distinct(df) | df.drop_duplicates() | select * from (select x, row_number() over (partition by x order by time asc) as x_rank from table) A where x_rank = 1;
|
Joins | inner_join(DF1, DF2, by = c('colname1', 'colname2')) left_join (DF1, DF2, by = c('colname1', 'colname2')) right_join(DF1, DF2, by = c('colname1', 'colname2')) full_join (DF1, DF2, by = c('colname1', 'colname2')) |
pd.merge(DF1, DF2, how='left', on=['colname1', 'colname2']) pd.merge(DF1, DF2, how='right', on=['colname1', 'colname2']) pd.merge(DF1, DF2, how='inner', on=['colname1', 'colname2']) pd.merge(DF1, DF2, how='outer', on=['colname1', 'colname2']) |
select * from A inner join, left join, cross join B on A.x = B.y; |
|
Sort | df %>% arrange(-x) | df.sort_values(['x','y'],ascending=False) | select * from table order by x desc; | |
Aggregation | df %>% group_by(x,y) %>% summarise(z = min(z)) | df.groupby(['x','y']).agg({'result1' : np.sum, 'result2' : np.mean}) | select x,count(*) from table group by x; | |
Convert to matrix/numpy | as.matrix(df) | df.to_numpy() | NA | |
Melt | melt(cheese, id.vars=c('first', 'last')) | pd.melt(cheese, id_vars=['first', 'last']) | Very Difficult | |
Spread | df %>% spread(Animal, FeedType) | df.pivot_table(values='Amount', index='Animal', columns='FeedType', aggfunc='sum') | select case when x = 'dog' then 1 else 0 end as dog_flag from table; | |
Apply to columns/rows | exp(df$x) or exp(df[1,]) | #dataframe df.apply(np.exp, axis=0 or 1) #series df['x'].map(np.exp) |
select exp(x) from table_name; | |
Apply element-wise | sapply(df$x, FUN=function(x) exp(x)) | df.applymap(lambda x: np.exp(x)) | NA | |
Windowing | df[,rank:=row_number(-x),by='y'] | df['window_rank'] = df.groupby(by=['C1'])['C2'].transform(lambda x: x.rank()) df['window_rank'] = df.sort_values('col1', ascending=True).groupby(['key_col']).cumcount() + 1 |
select lag(NCA) over (partition by final_device_id order by time asc) as lag_NCA from table; |
|
Reshape | matrix(as.matrix(df),nrow=dim(df)[1],ncol=1) | df.to_numpy().reshape(-1,1) | NA | |
SQL-like | sqldf('select * from df where x < 10') | df.query('x<10') | Already SQL |