参数传递 | Python | R |
命令行输入 | Python path/to/myscript.py arg1 arg2 arg3 | Rscript path/to/myscript.R arg1 arg2 arg3 |
脚本识别 | import sys my_args = sys.argv | myArgs <- commandArgs(trailingOnly = TRUE) |
数据传输与解析 | Python | R |
CSV(原生) | csv | read.csv |
CSV(优化) | pandas.read_csv("nba_2013.csv") | data.table::fread("nba_2013.csv") |
JSON | json(原生) | jsonlite |
YAML | PyYAML | yaml |
基本数据结构 | Python | R |
数组 | list:[1,'a'] | array:array(c(1,"a"),2) |
Key-Value(非结构化数据) | 字典:{"a":1} | lists |
数据框(结构化数据) | dataframe | data.frame |
R 中数据结构转化(plyr) | list | data frame | array |
list | llply() | ldply() | laply() |
data frame | dlply() | ddply() | daply() |
array | alply() | adply() | aaply() |
Python | R |
map | Map |
reduce | Reduce |
filter | Filter |
矩阵转化 | Python | R |
维度 | data.shape | dim(data) |
转为向量 | data.flatten() | as.vector(data) |
转为矩阵 | np.array([[1,2,3],[3,2,1]]) | matrix(c(1,2,3,3,2,1),nrow=2,byrow=T) |
转置 | data.T | t(data) |
矩阵变形 | data.reshape(1,np.prod(data.shape)) | matrix(data,ncol=nrow(data)*ncol(data)) |
矩阵按行拼接 | np.r_[A,B] | rbind(A,B) |
矩阵按列拼接 | np.c_[A,B] | cbind(A,B) |
矩阵计算 | Python | R |
矩阵乘法 | np.dot(A,B) | A %*% B |
矩阵幂指 | np.power(A,3) | A^3 |
全零矩阵 | np.zeros((3,3)) | matrix(0,nrow=3,ncol=3) |
矩阵求逆 | np.linalg.inv(A) | solve(A) |
协方差 | np.cov(A,B) | cov(A,B) |
特征值 | np.linalg.eig(A)[0] | eigen(A)$values |
特征向量 | np.linalg.eig(A)[1] | eigen(A)$vectors |
数据框操作 | Python | R |
按Factor的Select操作 | df[['a', 'c']] | dt[,.(a,c),] |
按Index的Select操作 | df.iloc[:,1:2] | dt[,1:2,with=FALSE] |
按Index的Filter操作 | df[1:2] | dt[1:2] |
groupby分组操作 | df.groupby(['a','b'])[['c','d']].mean() | aggregate(x=dt[, c("v1", "v2")], by=list(dt$by1, dt$by2), FUN = mean) |
%in% 匹配操作 返回T/F | pd.Series(np.arange(5),dtype=np.float32).isin([2, 4]) | 0:4 %in% c(2,4) |
match 匹配操作 返回Index | pd.Series(pd.match(pd.Series(np.arange(5),dtype=np.float32),[2,4],np.nan)) | match(0:4, c(2,4)) |
tapply | df.pivot_table(values='a', columns='c', aggfunc=np.max) | tapply(dt$a,dt$c,max)#其中dt$a是numeric,dt$c是nominal |
查询操作 | df[df.a <= df.b] | dt[ a<=b ] |
with操作 | pd.DataFrame({'a': np.random.randn(10), 'b': np.random.randn(10)}).eval('a + b') | with(dt,a + b) |
plyr操作 | df.groupby(['month','week']).agg([np.mean, np.std]) | ddply(dt, .(month, week), summarize,mean = round(mean(x), 2),sd = round(sd(x), 2)) |
多维数组融合 | pd.DataFrame([tuple(list(x)+[val]) for x, val in np.ndenumerate(np.array(list(range(1,24))+[np.NAN]).reshape(2,3,4))]) | data.frame(melt(array(c(1:23, NA), c(2,3,4)))) |
多维列表融合 | pd.DataFrame(list(enumerate(list(range(1,5))+[np.NAN]))) | data.frame(melt(as.list(c(1:4, NA)))) |
数据框融合 | pd.melt(pd.DataFrame({'first' : ['John', 'Mary'],'last' : ['Doe', 'Bo'],'height' : [5.5, 6.0],'weight' : [130, 150]}), id_vars=['first', 'last']) | melt(data.frame(first = c('John', 'Mary'),last = c('Doe', 'Bo'),height = c(5.5, 6.0),weight = c(130, 150)), id=c("first", "last")) |
数据透视表 pivot table | pd.pivot_table(pd.melt(pd.DataFrame({ 'x': np.random.uniform(1., 168., 12), 'y': np.random.uniform(7., 334., 12), 'z': np.random.uniform(1.7, 20.7, 12), 'month': [5,6,7]*4, 'week': [1,2]*6}), id_vars=['month', 'week']), values='value', index=['variable','week'],columns=['month'], aggfunc=np.mean) | acast(melt(data.frame(x = runif(12, 1, 168),y = runif(12, 7, 334),z = runif(12, 1.7, 20.7),month = rep(c(5,6,7),4),week = rep(c(1,2), 6)), id=c("month", "week")), week ~ month ~ variable, mean) |
连续型数值因子分类 | pd.cut(pd.Series([1,2,3,4,5,6]), 3) | cut(c(1,2,3,4,5,6), 3) |
名义型因子分类 | pd.Series([1,2,3,2,2,3]).astype("category") | factor(c(1,2,3,2,2,3)) |
# Chained pandas pipeline (the Python counterpart of the dplyr chain below):
# group by three keys, aggregate three columns, derive a ratio column,
# filter on it, and join with a second frame on 'a'.
(df
 .groupby(['a', 'b', 'c'], as_index=False)
 # fix: the original `.agg({'d': sum, 'e': mean, 'f', np.std})` was a
 # syntax error ('f', np.std is not a key:value pair) and used the
 # undefined bare name `mean`; use numpy aggregators throughout
 .agg({'d': np.sum, 'e': np.mean, 'f': np.std})
 .assign(g=lambda x: x.a / x.c)   # per-group ratio of a to c
 .query("g > 0.05")               # keep rows where the ratio exceeds 5%
 .merge(df2, on='a'))
# dplyr pipeline: mean arrival/departure delay per day, keeping only days
# where either average delay exceeds 30 minutes.
flights %>%
  group_by(year, month, day) %>%
  # fix: the original was missing the %>% after select(), which split the
  # chain and made summarise() a dangling standalone call
  select(arr_delay, dep_delay) %>%
  summarise(
    arr = mean(arr_delay, na.rm = TRUE),
    dep = mean(dep_delay, na.rm = TRUE)) %>%
  filter(arr > 30 | dep > 30)
# Scatterplot matrix of assists, field goals and total rebounds from the
# `nba` data frame (loaded elsewhere in the article).
import seaborn as sns
import matplotlib.pyplot as plt
sns.pairplot(nba[["ast", "fg", "trb"]])
plt.show()
# R equivalent of the seaborn pairplot above: scatterplot matrix of the
# same three nba columns via GGally::ggpairs().
library(GGally)
ggpairs(nba[,c("ast", "fg", "trb")])
# K-means clustering (k = 5, fixed seed for reproducibility) on the
# numeric, NA-free columns of `nba`, then a 2-D PCA projection of the same
# data colored by cluster label.
from sklearn.cluster import KMeans
kmeans_model = KMeans(n_clusters=5, random_state=1)
# keep only numeric columns and drop any column that contains missing
# values (axis=1 drops columns, not rows)
good_columns = nba._get_numeric_data().dropna(axis=1)
kmeans_model.fit(good_columns)
labels = kmeans_model.labels_
from sklearn.decomposition import PCA
pca_2 = PCA(2)  # two principal components, just enough for a 2-D plot
plot_columns = pca_2.fit_transform(good_columns)
# NOTE(review): relies on `plt` imported in the earlier seaborn snippet
plt.scatter(x=plot_columns[:,0], y=plot_columns[:,1], c=labels)
plt.show()
library(cluster)  # provides clusplot() used below
set.seed(1)       # make the kmeans() initialization reproducible
# Predicate: TRUE only for columns that contain no missing values and are
# numeric — the columns usable by kmeans()/prcomp() below.
isGoodCol <- function(col) {
  !anyNA(col) && is.numeric(col)
}
# R equivalent of the sklearn snippet above: select usable columns,
# cluster with k-means (k = 5), then plot the clusters on the first two
# principal components.
goodCols <- sapply(nba, isGoodCol)
clusters <- kmeans(nba[,goodCols], centers=5)
labels <- clusters$cluster
nba2d <- prcomp(nba[,goodCols], center=TRUE)
twoColumns <- nba2d$x[,1:2]  # scores on the first two principal components
clusplot(twoColumns, labels)
# IPython session: time filling a 100-million-element array in place.
# (%timeit is an IPython magic, not plain Python — run this in IPython.)
import numpy as np
xx = np.zeros(100000000)
%timeit xx[:] = 1
The slowest run took 9.29 times longer than the fastest. This could mean that an intermediate result is being cached
1 loops, best of 3: 111 ms per loop
# R equivalent of the numpy timing above: fill a 100-million-element
# vector in place and measure it with system.time().
xx <- rep(0, 100000000)
system.time(xx[] <- 1)
user system elapsed
1.326 0.103 1.433
# Print the fitted linear model; `fit` is created elsewhere in the article
# (the Call line below shows it was lm(ast ~ fg, data = train)).
summary(fit)
## Result
Call:
lm(formula = ast ~ fg, data = train)
Residuals:
Min 1Q Median 3Q Max
-228.26 -35.38 -11.45 11.99 559.61
[output truncated]
# Python equivalent of R's lm(ast ~ fg, data = train): ordinary least
# squares via statsmodels' R-style formula interface.
import statsmodels.formula.api as sm

# fix: the formula used 'fga' while the R example (and the output shown
# in this article) models ast ~ fg — use 'fg' for a like-for-like comparison
model = sm.ols(formula='ast ~ fg', data=train)
fitted = model.fit()
# print() so the summary table is also shown when run as a plain script,
# not just in an interactive session
print(fitted.summary())
##结果(注:原文此处误贴了 R 的 lm 输出;statsmodels 实际输出的是一张 OLS 回归结果表)
Call:
lm(formula = ast ~ fg, data = train)
Residuals:
Min 1Q Median 3Q Max
-228.26 -35.38 -11.45 11.99 559.61
[output truncated]
结论:R在做统计分析时更加简单粗暴!毕竟R是统计科班出身嘛。
欢迎光临 168大数据 (http://www.bi168.cn/) | Powered by Discuz! X3.2 |