numpy&pandas基础

numpy基础
Pandas基础

numpy基础

`import numpy as np`

定义array

In [156]: np.ones(3)
Out[156]: array([1., 1., 1.])
 
In [157]: np.ones((3,5))
Out[157]: 
array([[1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1.]])
 
In [158]: 
 
In [158]: np.zeros(4)
Out[158]: array([0., 0., 0., 0.])
 
In [159]: np.zeros((2,5))
Out[159]: 
array([[0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.]])
 
In [160]: 
In [146]: a = np.array([[1,3,5,2],[4,2,6,1]])
 
In [147]: print(a)
[[1 3 5 2]
 [4 2 6 1]]
 
In [148]:
In [161]: np.arange(10)
Out[161]: array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
 
In [162]: np.arange(3,13)
Out[162]: array([ 3,  4,  5,  6,  7,  8,  9, 10, 11, 12])
 
In [163]: np.arange(3,13).reshape((2,5))
Out[163]: 
array([[ 3,  4,  5,  6,  7],
       [ 8,  9, 10, 11, 12]])
 
In [164]: 
In [169]: np.arange(2,25,2)
Out[169]: array([ 2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22, 24])
 
In [170]: np.arange(2,25,2).reshape(3,4)
Out[170]: 
array([[ 2,  4,  6,  8],
       [10, 12, 14, 16],
       [18, 20, 22, 24]])
 
In [171]: 
 
In [176]:  np.linspace(1,10,4)
Out[176]: array([ 1.,  4.,  7., 10.])
 
In [177]:

array基本运算

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

266

267

268

269

270

271

272

273

274

275

276

277

278

279

280

281

282

283

284

285

286

287

288

289

290

291

292

293

294

295

296

297

298

299

300

301

302

303

304

305

306

307

308

309

310

311

312

313

314

315

316

317

318

319

320

321

322

323

324

325

326

327

328

329

330

331

332

333

334

335

336

337

338

339

340

341

342

343

344

In [7]: a = np.array([[1,2],[3,4]])
 
In [8]: b = np.arange(5,9).reshape((2,2))
In [10]: print(a)
[[1 2]
 [3 4]]
 
In [11]: print(b)
[[5 6]
 [7 8]]
 
In [12]:
 
In [12]: a+b
Out[12]: 
array([[ 6,  8],
       [10, 12]])
 
In [13]: a-b
Out[13]: 
array([[-4, -4],
       [-4, -4]])
 
In [14]: a*b     # 对应元素相乘
Out[14]: 
array([[ 5, 12],
       [21, 32]])
 
In [17]: a/b     # Python 2 下整数数组相除为整除，结果取整；Python 3 中 / 为真除法，结果为浮点数
Out[17]: 
array([[0, 0],
       [0, 0]])
 
In [18]: 
 
In [18]: a**2
Out[18]: 
array([[ 1, 4],
[ 9, 16]])
 
In [19]:
 
 
 
In [15]: np.dot(a,b)   # 矩阵乘法
Out[15]: 
array([[19, 22],
[43, 50]])
 
In [16]: a.dot(b)
Out[16]: 
array([[19, 22],
[43, 50]])
 
In [17]:
 
 
 
In [54]: print(a)
[[ 2  3  4  5]
 [ 6  7  8  9]
 [10 11 12 13]]
 
In [55]: np.sum(a)
Out[55]: 90
 
In [56]: np.min(a)
Out[56]: 2
 
In [57]: np.max(a)
Out[57]: 13
 
In [58]: 
 
In [58]: np.sum(a,axis=1)
Out[58]: array([14, 30, 46])
 
In [59]: np.sum(a,axis=0)
Out[59]: array([18, 21, 24, 27])
 
In [60]:
 
 
 
 
 
# 三角函数结合random生成一组随机数据
In [74]: N = 10
 
In [75]: t = np.linspace(0, 2*np.pi, N)
 
In [76]: print(t)
[0.         0.6981317  1.3962634  2.0943951  2.7925268  3.4906585
 4.1887902  4.88692191 5.58505361 6.28318531]
 
In [77]: y = np.sin(t) + 0.02*np.random.randn(N)
 
In [78]: print(y)
[-0.00947902  0.64196198  0.96567468  0.89394571  0.33830193 -0.3015316
 -0.86943758 -0.95954123 -0.62526393  0.02872202]
 
In [79]: M = 3 
 
In [80]: for ii, vv in zip(np.random.rand(M)*N, np.random.randn(M)):
    ...:     y[int(ii):] += vv
    ...:     
 
In [81]: print(y)
[-0.00947902  0.64196198  1.47685437  1.55309848  0.99745469  0.35762117
 -0.21028481 -0.30038846 -0.29746375  0.35652221]
 
In [82]: 
 
 
 
 
 
In [101]: a = np.arange(2,14).reshape((3,4)) 
 
In [102]: print(a)
[[ 2  3  4  5]
 [ 6  7  8  9]
 [10 11 12 13]]
 
In [103]: print(np.argmin(a))  # 最小值的索引
0
 
In [104]: print(np.argmax(a))  # 最大值的索引
11
 
In [105]: np.cumsum(a)         # 从0元素开始的累计和
Out[105]: array([ 2,  5,  9, 14, 20, 27, 35, 44, 54, 65, 77, 90])
 
In [106]: np.cumprod(a)        # 从1元素开始的累计乘
Out[106]: 
array([         2,          6,         24,        120,        720,
             5040,      40320,     362880,    3628800,   39916800,
        479001600, 6227020800])
 
In [107]: 
In [129]: a
Out[129]: 
array([[ 2,  3,  4,  5],
       [ 6,  7,  8,  9],
       [10, 11, 12, 13]])
 
In [130]: np.cumsum(a,axis=1)
Out[130]: 
array([[ 2,  5,  9, 14],
       [ 6, 13, 21, 30],
       [10, 21, 33, 46]])
 
In [131]: np.cumsum(a,axis=0)
Out[131]: 
array([[ 2,  3,  4,  5],
       [ 8, 10, 12, 14],
       [18, 21, 24, 27]])
 
In [132]:
In [133]: np.cumprod(a,axis=1)
Out[133]: 
array([[    2,     6,    24,   120],
       [    6,    42,   336,  3024],
       [   10,   110,  1320, 17160]])
 
In [134]: np.cumprod(a,axis=0)
Out[134]: 
array([[  2,   3,   4,   5],
       [ 12,  21,  32,  45],
       [120, 231, 384, 585]])
 
In [135]: 
 
 
 
 
In [146]: a = np.array([[1,3,5,2],[4,2,6,1]])
 
In [147]: print(a)
[[1 3 5 2]
 [4 2 6 1]]
 
In [148]: a.shape
Out[148]: (2, 4)
 
In [149]: a.ndim
Out[149]: 2
 
In [150]: a.size
Out[150]: 8
 
In [151]: np.diff(a)      # 累差运算
Out[151]: 
array([[ 2,  2, -3],
       [-2,  4, -5]])
 
In [152]: np.diff(a,axis=1)
Out[152]: 
array([[ 2,  2, -3],
       [-2,  4, -5]])
 
In [153]: np.diff(a,axis=0)
Out[153]: array([[ 3, -1,  1, -1]])
 
In [154]: 
 
 
 
 
 
In [108]: a = np.array([10,7,11,9,8,13,12,9])
 
In [109]: a.ndim
Out[109]: 1
 
In [110]: a.shape
Out[110]: (8,)
 
In [111]: a.size
Out[111]: 8
 
In [112]: a.mean()      # 均值
Out[112]: 9.875
 
In [113]: a.var()       # 方差
Out[113]: 3.609375
 
In [114]: a.std()       # 标准差
Out[114]: 1.899835519196333
 
In [115]:
In [117]: np.median(a)  # 中位数
Out[117]: 9.5
 
In [118]: 
In [138]: z = (a-a.mean())/a.std()   # z-score
 
In [139]: print(z)
[ 0.06579517 -1.5132889   0.59215653 -0.46056619 -0.98692754  1.64487924
  1.11851788 -0.46056619]
 
In [140]: 
 
 
 
 
In [198]: a = np.arange(-3,3).reshape((2,3))
 
In [199]: a
Out[199]: 
array([[-3, -2, -1],
       [ 0,  1,  2]])
 
In [200]: np.nonzero(a)  # 查找非0元素
Out[200]: (array([0, 0, 0, 1, 1]), array([0, 1, 2, 1, 2]))
 
In [201]: print(np.nonzero(a))
(array([0, 0, 0, 1, 1]), array([0, 1, 2, 1, 2]))
 
In [202]: 
 
 
 
In [207]: a = np.arange(14,2,-1).reshape((3,4))
 
In [208]: print(a)
[[14 13 12 11]
 [10  9  8  7]
 [ 6  5  4  3]]
 
In [209]: np.sort(a)     # 排序
Out[209]: 
array([[11, 12, 13, 14],
       [ 7,  8,  9, 10],
       [ 3,  4,  5,  6]])
 
In [210]: 
 
In [210]: np.sort(a,axis=1)
Out[210]: 
array([[11, 12, 13, 14],
       [ 7,  8,  9, 10],
       [ 3,  4,  5,  6]])
 
In [211]: np.sort(a,axis=0)
Out[211]: 
array([[ 6,  5,  4,  3],
       [10,  9,  8,  7],
       [14, 13, 12, 11]])
 
In [212]: 
 
 
 
 
# 矩阵的转置
In [212]: a = np.arange(14,2,-1).reshape((3,4))
 
In [213]: print(a)
[[14 13 12 11]
 [10  9  8  7]
 [ 6  5  4  3]]
 
In [214]: 
 
In [215]: print(np.transpose(a))
[[14 10  6]
 [13  9  5]
 [12  8  4]
 [11  7  3]]
 
In [216]: a.T
Out[216]: 
array([[14, 10,  6],
       [13,  9,  5],
       [12,  8,  4],
       [11,  7,  3]])
 
In [217]: 
 
In [220]: a.T.dot(a)  # 先转置，再进行矩阵乘法
Out[220]: 
array([[332, 302, 272, 242],
       [302, 275, 248, 221],
       [272, 248, 224, 200],
       [242, 221, 200, 179]])
 
In [221]: 
 
 
 
# 矩阵的clip，处理最大值和最小值
In [221]: print(a)
[[14 13 12 11]
 [10  9  8  7]
 [ 6  5  4  3]]
 
In [222]: np.clip(a,5,11)
Out[222]: 
array([[11, 11, 11, 11],
       [10,  9,  8,  7],
       [ 6,  5,  5,  5]])
 
In [223]: 

卷积运算

numpy.convolve(weights,array)
  
weight = [a,b,c]
array = [i,j,k,m,n]
  
Result：[ai, bi+aj, ci+bj+ak, cj+bk+am, ck+bm+an, cm+bn, cn][N-1:-N+1]
（其中 N 为 weight 的长度，此处 N=3；切片后只保留完全重叠的部分，即 [ci+bj+ak, cj+bk+am, ck+bm+an]）
 
针对移动平均算法来预测下一个数据，越接近待预测点的数据权重越大，
那么就需要让 i, j, k, m, n 的系数逐渐增大即可；即让 a > b > c ，并且 a+b+c=1 。
 
示例：
In [223]: weight = np.ones(3)/3
 
In [224]: print(weight)
[0.33333333 0.33333333 0.33333333]
 
In [225]: arr = np.array([8,11,9,7,10])
 
In [226]: np.convolve(weight,arr)
Out[226]: 
array([2.66666667, 6.33333333, 9.33333333, 9.        , 8.66666667,
       5.66666667, 3.33333333])
 
In [227]: 
 
In [227]: weight = np.array([0.8,0.1,0.1])
 
In [228]: np.convolve(weight,arr)
Out[228]: array([6.4, 9.6, 9.1, 7.6, 9.6, 1.7, 1. ])
 
In [229]:

random常用操作

# 生成随机浮点数，范围是在0.0~1.0之间
In [19]: a = np.random.random((2,3))
 
In [20]: print(a)
[[0.02185901 0.69585563 0.04555439]
 [0.37331857 0.32903986 0.62448246]]
 
In [21]:
 
# 生成随机整数，可指定起止范围
In [48]: np.random.randint(3)
Out[48]: 2
 
In [49]: np.random.randint(low=3,high=9)
Out[49]: 6
 
In [50]: np.random.randint(low=3,high=9,size=(3,4))
Out[50]: 
array([[5, 6, 7, 8],
       [8, 7, 3, 8],
       [5, 4, 5, 5]])
 
In [51]: 
In [68]: np.random.randint(low=-5,high=2,size=(3,4))
Out[68]: 
array([[-4, -4, -2,  1],
       [ 1,  0,  0,  1],
       [-4, -3,  1, -5]])
 
In [69]: 
 
# 生成正态分布，又名高斯分布（Gaussian distribution）随机数
In [64]: np.random.normal()
Out[64]: -0.5399414561419419
 
In [65]: np.random.normal(loc=0,scale=1,size=(2,3))
Out[65]: 
array([[-0.50318082, -0.38614219,  0.30450427],
       [ 0.41711087,  0.29990928, -0.7843322 ]])
 
In [66]:
In [66]: np.random.normal(loc=2,scale=3,size=(2,3))
Out[66]: 
array([[ 3.37067379,  6.23517315,  2.3267659 ],
       [ 6.46832646, -2.76363304,  5.77883853]])
 
In [67]:
 
# 生成标准正态分布（"standard normal" distribution）随机数，标准正态分布的平均值为0，方差为1，即服从N（0，1）分布。
In [83]: np.random.randn()
Out[83]: 0.502482341264108
 
In [84]: np.random.randn(3,4)
Out[84]: 
array([[ 0.34507555, -0.26868132, -0.56103417,  0.86176617],
       [-0.16535555, -0.38045904,  0.48176385, -1.09005206],
       [-0.60780266,  1.74113117, -0.72427329, -0.51232408]])
 
In [85]:
 
# 生成[0, 1)间随机数
In [99]: np.random.rand()
Out[99]: 0.607701127768974
 
In [100]: np.random.rand(3,4)
Out[100]: 
array([[0.73020695, 0.53993878, 0.46693879, 0.82611629],
       [0.76117076, 0.16522599, 0.85129611, 0.74448772],
       [0.6450236 , 0.49994053, 0.04115063, 0.30081311]])
 
In [101]:

array索引

# 一维数组的索引和list类似
略
 
# 二维数组的索引
In [13]: import numpy as np
 
In [14]: a = np.arange(3,15).reshape((3,4))
 
In [15]: print(a)
[[ 3  4  5  6]
 [ 7  8  9 10]
 [11 12 13 14]]
 
In [16]: a[1]
Out[16]: array([ 7,  8,  9, 10])
 
In [17]: a[1,2]
Out[17]: 9
 
In [18]: a[1][2]              # 等价于 a[1,2]
Out[18]: 9
 
In [19]: 
 
In [19]: a[1,1:-1]            # 获取第二行，除去首尾元素
Out[19]: array([8, 9])
 
In [20]: a[1,1:2]             # 获取第二行第二个元素
Out[20]: array([8])
 
In [21]:
In [24]: a[1:-1,2]            # 获取第三列，除去首尾元素
Out[24]: array([9])
 
In [26]: a[:,2]               # 获取第三列元素
Out[26]: array([ 5,  9, 13])
 
In [27]:

迭代array

# 迭代行
In [27]: print(a)
[[ 3  4  5  6]
 [ 7  8  9 10]
 [11 12 13 14]]
 
In [28]: for row in a:
    ...:     print(row)
    ...:     
[3 4 5 6]
[ 7  8  9 10]
[11 12 13 14]
 
In [29]:     
 
# 迭代列
In [29]: print(a.T)
[[ 3  7 11]
 [ 4  8 12]
 [ 5  9 13]
 [ 6 10 14]]
 
In [30]: for column in a.T:
    ...:     print(column)
    ...:     
[ 3  7 11]
[ 4  8 12]
[ 5  9 13]
[ 6 10 14]
 
In [31]: 
 
 
 
 
# 二维矩阵，多行转换成一行，迭代每一个item
In [31]: print(a)
[[ 3  4  5  6]
 [ 7  8  9 10]
 [11 12 13 14]]
 
In [32]: print(a.flat)
<numpy.flatiter object at 0x7f392e3545c0>
 
In [33]: print(a.flatten())
[ 3  4  5  6  7  8  9 10 11 12 13 14]
 
In [34]: for item in a.flat:
    ...:     print(item)
    ...:     
3
4
5
6
7
8
9
10
11
12
13
14
 
In [35]:

合并array

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

In [39]: a = np.array([1,2,3])
 
In [40]: b = np.array([2,2,2])
 
In [41]: c = np.vstack((a,b))     # vertical stack，上下合并
 
In [42]: print(c)
[[1 2 3]
 [2 2 2]]
 
In [43]: c.shape
Out[43]: (2, 3)
 
In [44]: c.ndim
Out[44]: 2
 
In [45]: c.size
Out[45]: 6
 
In [46]: 
 
 
 
In [47]: d = np.hstack((a,b))     # horizontal stack，左右合并
 
In [48]: print(d)
[1 2 3 2 2 2]
 
In [49]: d.shape
Out[49]: (6,)
 
In [50]: d.ndim
Out[50]: 1
 
In [51]: d.size
Out[51]: 6
 
In [52]: 
 
 
 
 
# newaxis改变数组维度
In [54]: print(a)
[1 2 3]
 
In [55]: e = a[np.newaxis,:]
 
In [56]: print(e)
[[1 2 3]]
 
In [57]: f = a[:,np.newaxis]
 
In [58]: print(f)
[[1]
 [2]
 [3]]
 
In [59]: 
 
 
 
 
In [59]: a = np.array([1,2,3])[:,np.newaxis]
 
In [60]: b = np.array([2,2,2])[:,np.newaxis]
 
In [61]: print(a)
[[1]
 [2]
 [3]]
 
In [62]: print(b)
[[2]
 [2]
 [2]]
 
In [63]: c = np.vstack((a,b))
 
In [64]: print(c)
[[1]
 [2]
 [3]
 [2]
 [2]
 [2]]
 
In [65]: d = np.hstack((a,b))        # 合并两个array
 
In [66]: print(d)
[[1 2]
 [2 2]
 [3 2]]
 
In [67]: 
In [74]: d = np.hstack((a,b,b,a))    # 合并多个array
 
In [75]: print(d)
[[1 2 2 1]
 [2 2 2 2]
 [3 2 2 3]]
 
In [76]: 
 
 
 
 
# concatenate 常用来合并多个矩阵或序列，axis可以方便的指定维度
In [76]: a = np.array([1,2,3])
 
In [77]: b = np.array([2,2,2])
 
In [78]: a = a[:,np.newaxis]
 
In [79]: b = b[:,np.newaxis]
 
In [80]: c = np.concatenate((a,b,b,a),axis=0)
 
In [81]: print(c)
[[1]
 [2]
 [3]
 [2]
 [2]
 [2]
 [2]
 [2]
 [2]
 [1]
 [2]
 [3]]
 
In [82]: c = np.concatenate((a,b,b,a),axis=1)
 
In [83]: print(c)
[[1 2 2 1]
 [2 2 2 2]
 [3 2 2 3]]
 
In [84]: 　　

分割array

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

In [92]: a = np.arange(12).reshape((3,4))
 
In [93]: print(a)
[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]]
 
In [94]: c = np.split(a,2,axis=1)               # 等项分割
 
In [95]: len(c)
Out[95]: 2
 
In [96]: c[0]
Out[96]: 
array([[0, 1],
       [4, 5],
       [8, 9]])
 
In [97]: c[1]
Out[97]: 
array([[ 2,  3],
       [ 6,  7],
       [10, 11]])
 
In [98]: 
 
In [98]: print(c)
[array([[0, 1],
       [4, 5],
       [8, 9]]), array([[ 2,  3],
       [ 6,  7],
       [10, 11]])]
 
In [99]: 
 
 
In [99]: d = np.array_split(a,3,axis=1)         # 不等项分割
 
In [100]: len(d)
Out[100]: 3
 
In [101]: print(d)
[array([[0, 1],
       [4, 5],
       [8, 9]]), array([[ 2],
       [ 6],
       [10]]), array([[ 3],
       [ 7],
       [11]])]
 
In [102]: d[0]
Out[102]: 
array([[0, 1],
       [4, 5],
       [8, 9]])
 
In [103]: d[1]
Out[103]: 
array([[ 2],
       [ 6],
       [10]])
 
In [104]: d[2]
Out[104]: 
array([[ 3],
       [ 7],
       [11]])
 
In [105]: 
 
 
 
 
 
In [111]: print(a)
[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]]
 
In [112]: b = np.hsplit(a,2)          # horizontal split，水平分割
 
In [113]: print(b)
[array([[0, 1],
       [4, 5],
       [8, 9]]), array([[ 2,  3],
       [ 6,  7],
       [10, 11]])]
 
In [114]: b[0]
Out[114]: 
array([[0, 1],
       [4, 5],
       [8, 9]])
 
In [115]: b[1]
Out[115]: 
array([[ 2,  3],
       [ 6,  7],
       [10, 11]])
 
In [116]: 
 
In [116]: c = np.vsplit(a,3)          # vertical split，垂直分割
 
In [117]: len(c)
Out[117]: 3
 
In [118]: print(c)
[array([[0, 1, 2, 3]]), array([[4, 5, 6, 7]]), array([[ 8,  9, 10, 11]])]
 
In [119]: c[0]
Out[119]: array([[0, 1, 2, 3]])
 
In [120]: c[1]
Out[120]: array([[4, 5, 6, 7]])
 
In [121]: c[2]
Out[121]: array([[ 8,  9, 10, 11]])
 
In [122]:

Numpy.copy()

In [150]: a = np.arange(4)
 
In [151]: print(a)
[0 1 2 3]
 
In [152]: b = a
 
In [153]: b is a
Out[153]: True
 
In [154]: a[0] = 99
 
In [155]: print(b)
[99  1  2  3]
 
In [156]: 
 
In [156]: c = a.copy()      # deep copy
 
In [157]: c is a
Out[157]: False
 
In [159]: print(a)
[99  1  2  3]
 
In [160]: a[1:3] = [7,8]
 
In [161]: print(a)
[99  7  8  3]
 
In [163]: print(b)
[99  7  8  3]
 
In [164]: print(c)
[99  1  2  3]
 
In [165]:

Numpy其他

In [169]: a = np.array([-9,7,12,-4,-3,6,2])
 
In [170]: print(a)
[-9  7 12 -4 -3  6  2]
 
In [171]: np.abs(a)
Out[171]: array([ 9,  7, 12,  4,  3,  6,  2])
 
In [172]: np.where(np.abs(a)>6)
Out[172]: (array([0, 1, 2]),)
 
In [173]: 

numpy参考：http://pda.readthedocs.io/en/latest/chp4.html

回到顶部

Pandas基础

`import pandas as pd`

Series

In [173]: import pandas as pd
 
In [174]: import numpy as np
 
In [175]: s = pd.Series([1,3,6,np.nan,44,1])                  # 定义pandas.Series
 
In [176]: print(s)
0     1.0
1     3.0
2     6.0
3     NaN
4    44.0
5     1.0
dtype: float64
 
In [177]:

Base Time Series Frequencies

Aggregate for duplicate Indices

In [157]: dates = pd.DatetimeIndex(['1/1/2000', '1/2/2000', '1/2/2000', '1/2/2000','1/3/2000','1/3/2000'])
 
In [158]: dates
Out[158]: 
DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-02', '2000-01-02',
               '2000-01-03', '2000-01-03'],
              dtype='datetime64[ns]', freq=None)
 
In [159]: dup_ts = pd.Series(np.arange(6), index=dates)
 
In [160]: dup_ts
Out[160]: 
2000-01-01    0
2000-01-02    1
2000-01-02    2
2000-01-02    3
2000-01-03    4
2000-01-03    5
dtype: int64
 
In [161]: dup_ts.index.is_unique
Out[161]: False
 
In [162]: dup_ts['2000-01-01']
Out[162]: 0
 
In [163]: dup_ts['2000-01-02']
Out[163]: 
2000-01-02    1
2000-01-02    2
2000-01-02    3
dtype: int64
 
In [164]: dup_ts['2000-01-03']
Out[164]: 
2000-01-03    4
2000-01-03    5
dtype: int64
 
In [165]: 
 
In [165]: grouped = dup_ts.groupby(level=0)
 
In [166]: grouped.mean()
Out[166]: 
2000-01-01    0.0
2000-01-02    2.0
2000-01-03    4.5
dtype: float64
 
In [167]: grouped.count()
Out[167]: 
2000-01-01    1
2000-01-02    3
2000-01-03    2
dtype: int64
 
In [168]: grouped.sum()
Out[168]: 
2000-01-01    0
2000-01-02    6
2000-01-03    9
dtype: int64
 
In [169]:

Group by month or weekday by passing a function that accesses those fields on the time series’s index.

In [90]: rng = pd.date_range('1/1/2000', periods=100, freq='D')
 
In [91]: ts = pd.Series(np.arange(100), index=rng)
 
In [92]: ts.groupby(lambda x: x.month).mean()
Out[92]: 
1    15
2    45
3    75
4    95
dtype: int64
 
In [93]: ts.groupby(lambda x: x.month).sum()
Out[93]: 
1     465
2    1305
3    2325
4     855
dtype: int64
 
In [94]: ts.groupby(lambda x: x.month).max()
Out[94]: 
1    30
2    59
3    90
4    99
dtype: int64
 
In [95]: ts.groupby(lambda x: x.weekday).mean()
Out[95]: 
0    47.5
1    48.5
2    49.5
3    50.5
4    51.5
5    49.0
6    50.0
dtype: float64
 
In [96]: ts.groupby(lambda x: x.weekday).sum()
Out[96]: 
0    665
1    679
2    693
3    707
4    721
5    735
6    750
dtype: int64
 
In [97]:

Resample method arguments

Resampling and Frequency Conversion

In [50]: rng = pd.date_range('1/1/2000', periods=100, freq='D')
 
In [51]: ts = pd.Series(np.random.randn(len(rng)), index=rng)
 
In [52]: ts
Out[52]: 
2000-01-01    0.030631
2000-01-02   -2.087034
2000-01-03    1.238687
2000-01-04   -1.297059
2000-01-05   -1.341296
2000-01-06   -0.353311
2000-01-07   -0.854693
2000-01-08    0.426789
                ...   
2000-03-27    1.262705
2000-03-28   -0.646236
2000-03-29   -0.349658
2000-03-30   -1.093438
2000-03-31   -0.254758
2000-04-01    0.146417
2000-04-02    1.774502
2000-04-03   -0.712635
2000-04-04   -1.552352
2000-04-05    0.303172
2000-04-06   -0.023492
2000-04-07   -1.418930
2000-04-08    0.789877
2000-04-09    1.767594
Freq: D, Length: 100, dtype: float64
 
In [53]: 
 
In [53]: ts.resample('M').mean()
Out[53]: 
2000-01-31    0.003531
2000-02-29    0.030067
2000-03-31   -0.106783
2000-04-30    0.119350
Freq: M, dtype: float64
 
In [54]: ts.resample('M',kind='period').mean()
Out[54]: 
2000-01    0.003531
2000-02    0.030067
2000-03   -0.106783
2000-04    0.119350
Freq: M, dtype: float64
 
In [55]: 

Aggregate this data into five-minute chunks or bars by taking the sum of each group.

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

In [71]: rng = pd.date_range('1/1/2000', periods=24, freq='T')
 
In [72]: rng
Out[72]: 
DatetimeIndex(['2000-01-01 00:00:00', '2000-01-01 00:01:00',
               '2000-01-01 00:02:00', '2000-01-01 00:03:00',
               '2000-01-01 00:04:00', '2000-01-01 00:05:00',
               '2000-01-01 00:06:00', '2000-01-01 00:07:00',
               '2000-01-01 00:08:00', '2000-01-01 00:09:00',
               '2000-01-01 00:10:00', '2000-01-01 00:11:00',
               '2000-01-01 00:12:00', '2000-01-01 00:13:00',
               '2000-01-01 00:14:00', '2000-01-01 00:15:00',
               '2000-01-01 00:16:00', '2000-01-01 00:17:00',
               '2000-01-01 00:18:00', '2000-01-01 00:19:00',
               '2000-01-01 00:20:00', '2000-01-01 00:21:00',
               '2000-01-01 00:22:00', '2000-01-01 00:23:00'],
              dtype='datetime64[ns]', freq='T')
 
In [73]: ts = pd.Series(np.arange(24), index=rng)
 
In [74]: ts
Out[74]: 
2000-01-01 00:00:00     0
2000-01-01 00:01:00     1
2000-01-01 00:02:00     2
2000-01-01 00:03:00     3
2000-01-01 00:04:00     4
2000-01-01 00:05:00     5
2000-01-01 00:06:00     6
2000-01-01 00:07:00     7
2000-01-01 00:08:00     8
2000-01-01 00:09:00     9
2000-01-01 00:10:00    10
2000-01-01 00:11:00    11
2000-01-01 00:12:00    12
2000-01-01 00:13:00    13
2000-01-01 00:14:00    14
2000-01-01 00:15:00    15
2000-01-01 00:16:00    16
2000-01-01 00:17:00    17
2000-01-01 00:18:00    18
2000-01-01 00:19:00    19
2000-01-01 00:20:00    20
2000-01-01 00:21:00    21
2000-01-01 00:22:00    22
2000-01-01 00:23:00    23
Freq: T, dtype: int64
 
In [75]: ts.resample('5min').sum()
Out[75]: 
2000-01-01 00:00:00    10
2000-01-01 00:05:00    35
2000-01-01 00:10:00    60
2000-01-01 00:15:00    85
2000-01-01 00:20:00    86
Freq: 5T, dtype: int64
 
In [76]: ts.resample('5min',closed='left').sum()
Out[76]: 
2000-01-01 00:00:00    10
2000-01-01 00:05:00    35
2000-01-01 00:10:00    60
2000-01-01 00:15:00    85
2000-01-01 00:20:00    86
Freq: 5T, dtype: int64
 
In [77]: 
 
In [77]: ts.resample('5min').max()
Out[77]: 
2000-01-01 00:00:00     4
2000-01-01 00:05:00     9
2000-01-01 00:10:00    14
2000-01-01 00:15:00    19
2000-01-01 00:20:00    23
Freq: 5T, dtype: int64
 
In [78]: 
 
In [78]: ts.resample('5min',closed='right').sum()
Out[78]: 
1999-12-31 23:55:00     0
2000-01-01 00:00:00    15
2000-01-01 00:05:00    40
2000-01-01 00:10:00    65
2000-01-01 00:15:00    90
2000-01-01 00:20:00    66
Freq: 5T, dtype: int64
 
In [79]: 
 
In [79]: ts.resample('5min',loffset='-1s').sum()
Out[79]: 
1999-12-31 23:59:59    10
2000-01-01 00:04:59    35
2000-01-01 00:09:59    60
2000-01-01 00:14:59    85
2000-01-01 00:19:59    86
Freq: 5T, dtype: int64
 
In [80]:
 
 
 
# Open-High-Low-Close (OHLC) resampling
In [81]: ts.resample('5min').ohlc()
Out[81]: 
                     open  high  low  close
2000-01-01 00:00:00     0     4    0      4
2000-01-01 00:05:00     5     9    5      9
2000-01-01 00:10:00    10    14   10     14
2000-01-01 00:15:00    15    19   15     19
2000-01-01 00:20:00    20    23   20     23
 
In [82]:

Resampling with Periods

In [118]: frame = pd.DataFrame(np.random.randn(24, 4),
     ...:     index=pd.period_range('1-2000', '12-2001', freq='M'),
     ...:     columns=['Beijing', 'Luoyang', 'New York', 'Tokyo'])
 
In [119]: frame
Out[119]: 
          Beijing   Luoyang  New York     Tokyo
2000-01  1.120268 -1.120345 -1.154800  0.443861
2000-02  0.611443  0.200576 -1.163600 -1.137567
2000-03  0.658112  2.332235 -1.718285  1.589246
2000-04 -0.863050  1.890877  2.046202  0.410414
2000-05  0.710052 -0.041623  0.122719 -1.141112
2000-06  0.299393  1.227689  0.718627  1.004851
2000-07  1.287335 -0.179045 -0.476422  0.949235
2000-08 -2.140590  0.433699 -0.783202  1.073706
2000-09 -0.149710 -0.580780  0.755274  0.514259
2000-10  0.190940 -0.187451  1.710803 -1.631272
2000-11  0.419288  0.565235  0.470381  0.599020
2000-12  0.951111  0.464671 -0.854858 -0.009189
2001-01 -1.383493 -0.147035 -0.379006  0.472686
2001-02  1.803475 -1.628368 -0.896757 -0.508827
2001-03  0.575910 -0.528299  1.182473  0.159452
2001-04 -1.056161 -0.475357  0.861852  1.168667
2001-05 -1.316565  0.354719  1.354205 -0.369083
2001-06  0.497406 -1.799904 -0.512882 -0.092718
2001-07  0.896944 -1.276022  0.137365  0.087199
2001-08 -0.046908 -0.650024  0.958182 -0.048369
2001-09  0.085401  1.067235  0.541318  0.853376
2001-10  1.165047 -0.794425  1.137002  0.064595
2001-11 -0.438006  0.706564  1.464403  0.278069
2001-12 -0.094644  0.666789  0.220349 -0.386617
 
In [120]: frame[:5]
Out[120]: 
          Beijing   Luoyang  New York     Tokyo
2000-01  1.120268 -1.120345 -1.154800  0.443861
2000-02  0.611443  0.200576 -1.163600 -1.137567
2000-03  0.658112  2.332235 -1.718285  1.589246
2000-04 -0.863050  1.890877  2.046202  0.410414
2000-05  0.710052 -0.041623  0.122719 -1.141112
 
In [121]: annual_frame = frame.resample('A-DEC').mean()
 
In [122]: annual_frame
Out[122]: 
       Beijing   Luoyang  New York     Tokyo
2000  0.257883  0.417145 -0.027263  0.222121
2001  0.057367 -0.375344  0.505709  0.139869
 
In [123]: 
 
In [123]: annual_frame_max = frame.resample('A-DEC').max()
 
In [124]: annual_frame_max
Out[124]: 
       Beijing   Luoyang  New York     Tokyo
2000  1.287335  2.332235  2.046202  1.589246
2001  1.803475  1.067235  1.464403  1.168667
 
In [125]:

DataFrame

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

# 第一种定义pandas.DataFrame方式：直接导入numpy的数据
In [186]: df1 = pd.DataFrame(np.arange(12).reshape((3,4)))    # 定义pandas.DataFrame
 
In [187]: print(df1)
   0  1   2   3
0  0  1   2   3
1  4  5   6   7
2  8  9  10  11
 
In [188]: 
 
In [178]: dates = pd.date_range('20160101',periods=6)
 
In [179]: print(dates)
DatetimeIndex(['2016-01-01', '2016-01-02', '2016-01-03', '2016-01-04',
               '2016-01-05', '2016-01-06'],
              dtype='datetime64[ns]', freq='D')
 
In [180]: 
 
# 定义pandas.DataFrame，并指定列名和行名
In [184]: df = pd.DataFrame(np.random.randn(6,4),index=dates,columns=['a','b','c','d'])
 
In [185]: print(df)
                   a         b         c         d
2016-01-01  1.193589  0.165348  1.598806 -0.478980
2016-01-02  1.188886 -1.232185 -0.633066  0.594805
2016-01-03  2.707996 -0.116420  1.622761  0.399708
2016-01-04  0.416469  1.593061 -0.044390 -0.031153
2016-01-05 -0.637080  1.680110  1.371026  0.821549
2016-01-06 -0.079359  1.421577  0.042537  1.058749
 
In [186]: 
 
 
# 第二种定义pandas.DataFrame方式：把参数当做字典传入DataFrame
In [188]: df2 = pd.DataFrame({'A' : 1.,
     ...:                     'B' : pd.Timestamp('20130102'),
     ...:                     'C' : pd.Series(1,index=list(range(4)),dtype='float32'),
     ...:                     'D' : np.array([3] * 4,dtype='int32'),
     ...:                     'E' : pd.Categorical(["test","train","test","train"]),
     ...:                     'F' : 'foo'})
 
In [189]: print(df2)
     A          B    C  D      E    F
0  1.0 2013-01-02  1.0  3   test  foo
1  1.0 2013-01-02  1.0  3  train  foo
2  1.0 2013-01-02  1.0  3   test  foo
3  1.0 2013-01-02  1.0  3  train  foo
 
In [190]:
In [190]: print(df2.dtypes)                      # 查看DataFrame内容的类型
A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object
 
In [191]:
In [191]: print(df2.index)                       # 打印DataFrame行的名字（行索引）
Int64Index([0, 1, 2, 3], dtype='int64')
 
In [192]: 
In [192]: print(df2.columns)                     # 打印DataFrame列的名字
Index([u'A', u'B', u'C', u'D', u'E', u'F'], dtype='object')
 
In [193]: 
  
In [194]: print(df2.values)                      # 打印DataFrame的内容
[[1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'test' 'foo']
 [1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'train' 'foo']
 [1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'test' 'foo']
 [1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'train' 'foo']]
 
In [195]: 
 
 
 
 
 
In [196]: print(df2)
     A          B    C  D      E    F
0  1.0 2013-01-02  1.0  3   test  foo
1  1.0 2013-01-02  1.0  3  train  foo
2  1.0 2013-01-02  1.0  3   test  foo
3  1.0 2013-01-02  1.0  3  train  foo
 
In [197]: 
 
In [197]: print(df2.describe())                  # 打印出DataFrame的数学运算的相关数据
         A    C    D
count  4.0  4.0  4.0
mean   1.0  1.0  3.0
std    0.0  0.0  0.0
min    1.0  1.0  3.0
25%    1.0  1.0  3.0
50%    1.0  1.0  3.0
75%    1.0  1.0  3.0
max    1.0  1.0  3.0
 
In [198]: 
 
 
In [200]: print(df2.T)                           # 把DataFrame进行transpose，即转置
                     0                    1                    2                    3
A                    1                    1                    1                    1
B  2013-01-02 00:00:00  2013-01-02 00:00:00  2013-01-02 00:00:00  2013-01-02 00:00:00
C                    1                    1                    1                    1
D                    3                    3                    3                    3
E                 test                train                 test                train
F                  foo                  foo                  foo                  foo
 
In [201]: 
 
 
 
 
 
 
# 对DataFrame排序
In [203]: print(df2)
     A          B    C  D      E    F
0  1.0 2013-01-02  1.0  3   test  foo
1  1.0 2013-01-02  1.0  3  train  foo
2  1.0 2013-01-02  1.0  3   test  foo
3  1.0 2013-01-02  1.0  3  train  foo
 
In [204]: df2.sort_index(axis=1, ascending=False)   # 按照index（列名）排序
Out[204]: 
     F      E  D    C          B    A
0  foo   test  3  1.0 2013-01-02  1.0
1  foo  train  3  1.0 2013-01-02  1.0
2  foo   test  3  1.0 2013-01-02  1.0
3  foo  train  3  1.0 2013-01-02  1.0
 
In [205]:
In [205]: df2.sort_index(axis=0, ascending=False)   # 按照行名排序
Out[205]: 
     A          B    C  D      E    F
3  1.0 2013-01-02  1.0  3  train  foo
2  1.0 2013-01-02  1.0  3   test  foo
1  1.0 2013-01-02  1.0  3  train  foo
0  1.0 2013-01-02  1.0  3   test  foo
 
In [206]: 
 
 
 
In [207]: df2.sort_values(by='E')                   # 指定value进行排序
Out[207]: 
     A          B    C  D      E    F
0  1.0 2013-01-02  1.0  3   test  foo
2  1.0 2013-01-02  1.0  3   test  foo
1  1.0 2013-01-02  1.0  3  train  foo
3  1.0 2013-01-02  1.0  3  train  foo
 
In [208]: 　　

Pandas筛选数据

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

In [212]: dates = pd.date_range('20160101',periods=6)
 
In [213]: df = pd.DataFrame(np.arange(24).reshape(6,4),index=dates,columns=['A','B','C','D'])
 
In [214]: print(df)
             A   B   C   D
2016-01-01   0   1   2   3
2016-01-02   4   5   6   7
2016-01-03   8   9  10  11
2016-01-04  12  13  14  15
2016-01-05  16  17  18  19
2016-01-06  20  21  22  23
 
In [215]: 
 
In [215]: print(df['A'])            # 选取指定列
2016-01-01     0
2016-01-02     4
2016-01-03     8
2016-01-04    12
2016-01-05    16
2016-01-06    20
Freq: D, Name: A, dtype: int64
  
In [216]: print(df.A)               # 等价于 df['A']
2016-01-01     0
2016-01-02     4
2016-01-03     8
2016-01-04    12
2016-01-05    16
2016-01-06    20
Freq: D, Name: A, dtype: int64
 
In [217]:
 
In [217]: print(df[0:3])            # 切片方式选取某些行
            A  B   C   D
2016-01-01  0  1   2   3
2016-01-02  4  5   6   7
2016-01-03  8  9  10  11
 
In [218]: print(df['2016-01-01':'2016-01-03'])   # 等价于 df[0:3]
            A  B   C   D
2016-01-01  0  1   2   3
2016-01-02  4  5   6   7
2016-01-03  8  9  10  11
 
In [219]: 
 
 
 
 
# select by label : loc
In [220]: print(df.loc['2016-01-02'])
A    4
B    5
C    6
D    7
Name: 2016-01-02 00:00:00, dtype: int64
 
In [221]: 
In [221]: print(df.loc['2016-01-02']['B'])
5
 
In [222]: 
 
In [227]: print(df.loc[:,['A','B']])
             A   B
2016-01-01   0   1
2016-01-02   4   5
2016-01-03   8   9
2016-01-04  12  13
2016-01-05  16  17
2016-01-06  20  21
 
In [228]: 
In [228]: print(df.loc['2016-01-03',['A','B']])
A    8
B    9
Name: 2016-01-03 00:00:00, dtype: int64
 
In [229]: 
In [232]: print(df.loc['2016-01-03':'2016-01-05',['A','B']])
             A   B
2016-01-03   8   9
2016-01-04  12  13
2016-01-05  16  17
 
In [233]: 
 
 
 
 
# select by position : iloc
In [235]: print(df)
             A   B   C   D
2016-01-01   0   1   2   3
2016-01-02   4   5   6   7
2016-01-03   8   9  10  11
2016-01-04  12  13  14  15
2016-01-05  16  17  18  19
2016-01-06  20  21  22  23
 
In [236]: print(df.iloc[3])
A    12
B    13
C    14
D    15
Name: 2016-01-04 00:00:00, dtype: int64
 
In [237]: print(df.iloc[3,1])
13
 
In [238]: 
 
In [238]: print(df.iloc[3:5,1:3])
             B   C
2016-01-04  13  14
2016-01-05  17  18
 
In [239]: 
 
In [240]: print(df.iloc[[1,3,5],1:3])
             B   C
2016-01-02   5   6
2016-01-04  13  14
2016-01-06  21  22
 
In [241]: 
 
 
 
 
# mixed selection : ix（注意：.ix 自 pandas 0.20 起被弃用，并已在 pandas 1.0 中移除；新代码请改用 .loc / .iloc，例如 df.iloc[:3][['A','C']]）
In [243]: print(df.ix[:3,['A','C']])
/usr/local/anaconda2/bin/ipython2:1: DeprecationWarning: 
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing
 
See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  #!/usr/local/anaconda2/bin/python
            A   C
2016-01-01  0   2
2016-01-02  4   6
2016-01-03  8  10
 
In [244]: 
 
 
 
 
# Boolean indexing
In [9]: print(df[df.A>8])
             A   B   C   D
2016-01-04  12  13  14  15
2016-01-05  16  17  18  19
2016-01-06  20  21  22  23
 
In [10]:

df.head(n)      # 返回DataFrame前n行
 
df.tail(n)      # 返回DataFrame后n行

Pandas设置值

100

101

102

103

104

105

106

107

108

109

110

111

# 给DataFrame设置值
In [1]: import numpy as np
 
In [2]: import pandas as pd
 
In [3]: dates = pd.date_range('20160101',periods=6)
 
In [4]: df = pd.DataFrame(np.arange(24).reshape(6,4),index=dates,columns=['A','B','C','D'])
 
In [5]: print(df)
             A   B   C   D
2016-01-01   0   1   2   3
2016-01-02   4   5   6   7
2016-01-03   8   9  10  11
2016-01-04  12  13  14  15
2016-01-05  16  17  18  19
2016-01-06  20  21  22  23
 
In [6]:
In [7]: df.iloc[2,2] = 99
 
In [10]: df.loc['2016-01-02','B'] = 100
 
In [11]: print(df)
             A    B   C   D
2016-01-01   0    1   2   3
2016-01-02   4  100   6   7
2016-01-03   8    9  99  11
2016-01-04  12   13  14  15
2016-01-05  16   17  18  19
2016-01-06  20   21  22  23
 
In [12]:
 
 
 
 
In [17]: print(df)
             A   B   C   D
2016-01-01   0   1   2   3
2016-01-02   4   5   6   7
2016-01-03   8   9  10  11
2016-01-04  12  13  14  15
2016-01-05  16  17  18  19
2016-01-06  20  21  22  23
 
In [18]: df.A[df.A>4] = 0
 
In [19]: print(df)
            A   B   C   D
2016-01-01  0   1   2   3
2016-01-02  4   5   6   7
2016-01-03  0   9  10  11
2016-01-04  0  13  14  15
2016-01-05  0  17  18  19
2016-01-06  0  21  22  23
 
In [20]: 
 
 
 
In [21]: print(df)
             A   B   C   D
2016-01-01   0   1   2   3
2016-01-02   4   5   6   7
2016-01-03   8   9  10  11
2016-01-04  12  13  14  15
2016-01-05  16  17  18  19
2016-01-06  20  21  22  23
 
In [22]: df[df.A>4] = 0
 
In [23]: print(df)
            A  B  C  D
2016-01-01  0  1  2  3
2016-01-02  4  5  6  7
2016-01-03  0  0  0  0
2016-01-04  0  0  0  0
2016-01-05  0  0  0  0
2016-01-06  0  0  0  0
 
In [24]: 
 
 
 
 
In [30]: df['E'] = np.nan        # 增加一列，赋值为NaN
 
In [31]: print(df)
             A   B   C   D   E
2016-01-01   0   1   2   3 NaN
2016-01-02   4   5   6   7 NaN
2016-01-03   8   9  10  11 NaN
2016-01-04  12  13  14  15 NaN
2016-01-05  16  17  18  19 NaN
2016-01-06  20  21  22  23 NaN
 
In [32]: 
                                 # 增加一列，需要指定行名（index）
In [46]: df['F'] = pd.Series([1,2,3,4,5,6], index=pd.date_range('20160101',periods=6)) 
 
In [47]: print(df)
             A   B   C   D   E  F
2016-01-01   0   1   2   3 NaN  1
2016-01-02   4   5   6   7 NaN  2
2016-01-03   8   9  10  11 NaN  3
2016-01-04  12  13  14  15 NaN  4
2016-01-05  16  17  18  19 NaN  5
2016-01-06  20  21  22  23 NaN  6
 
In [48]:

Pandas删除DataFrame数据

In [1]: import numpy as np
 
In [2]: import pandas as pd
 
In [3]: values = np.arange(12).reshape((3,4))
 
In [4]: print(values)
[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]]
 
In [5]:
In [8]: df = pd.DataFrame(values,index=['row1','row2','row3'],columns=['A','B','C','D'])
 
In [9]: print(df)
      A  B   C   D
row1  0  1   2   3
row2  4  5   6   7
row3  8  9  10  11
 
In [10]:
In [10]: print(df.shape)
(3, 4)
 
In [11]:
In [11]: df.drop(columns='A',axis=1)
Out[11]: 
      B   C   D
row1  1   2   3
row2  5   6   7
row3  9  10  11
 
In [12]: df.drop(columns=['A','C'],axis=1)
Out[12]: 
      B   D
row1  1   3
row2  5   7
row3  9  11
 
In [13]: 
 
In [13]: df.drop(index='row2',axis=0)
Out[13]: 
      A  B   C   D
row1  0  1   2   3
row3  8  9  10  11
 
In [14]: df.drop(index=['row2','row3'],axis=0)
Out[14]: 
      A  B  C  D
row1  0  1  2  3
 
In [15]:

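如果index用的是 “pd.date_range('20160101',periods=6)”（注意：下面用到的 pd.datetime 已在 pandas 2.0 中移除，新版本请改用 pd.Timestamp('2016-01-04') 或标准库的 datetime.datetime(2016,1,4)）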

In [43]: print(df)
                   a         b         c         d
2016-01-01  1.273748  0.949407 -0.446053 -0.126789
2016-01-02 -0.770801  1.641150  0.840216 -0.991219
2016-01-03 -0.164625 -1.459954  1.214388  0.281621
2016-01-04  1.863281  1.163653  0.319549 -1.545655
2016-01-05  0.452804  0.203472 -1.232536  0.681963
2016-01-06  0.171324  0.353359  1.674004 -2.026071
 
In [44]: print(df.index)
DatetimeIndex(['2016-01-01', '2016-01-02', '2016-01-03', '2016-01-04',
               '2016-01-05', '2016-01-06'],
              dtype='datetime64[ns]', freq='D')
 
In [45]: 
 
In [45]: df.drop(index=pd.datetime(2016,1,4),axis=0)
Out[45]: 
                   a         b         c         d
2016-01-01  1.273748  0.949407 -0.446053 -0.126789
2016-01-02 -0.770801  1.641150  0.840216 -0.991219
2016-01-03 -0.164625 -1.459954  1.214388  0.281621
2016-01-05  0.452804  0.203472 -1.232536  0.681963
2016-01-06  0.171324  0.353359  1.674004 -2.026071
 
In [46]: df.drop(index=[pd.datetime(2016,1,2),pd.datetime(2016,1,5)],axis=0)
Out[46]: 
                   a         b         c         d
2016-01-01  1.273748  0.949407 -0.446053 -0.126789
2016-01-03 -0.164625 -1.459954  1.214388  0.281621
2016-01-04  1.863281  1.163653  0.319549 -1.545655
2016-01-06  0.171324  0.353359  1.674004 -2.026071
 
In [47]:

Pandas处理丢失的数据

# 处理丢失数据
 
In [7]: print(df)
             A   B   C   D
2016-01-01   0   1   2   3
2016-01-02   4   5   6   7
2016-01-03   8   9  10  11
2016-01-04  12  13  14  15
2016-01-05  16  17  18  19
2016-01-06  20  21  22  23
 
In [8]: df.iloc[0,1] = np.nan
 
In [9]: df.iloc[1,2] = np.nan
 
In [10]: print(df)
             A     B     C   D
2016-01-01   0   NaN   2.0   3
2016-01-02   4   5.0   NaN   7
2016-01-03   8   9.0  10.0  11
2016-01-04  12  13.0  14.0  15
2016-01-05  16  17.0  18.0  19
2016-01-06  20  21.0  22.0  23
 
In [11]: print(df.dropna(axis=1,how='any'))  # 删除NaN数据所在列，how = {'any','all'}
             A   D
2016-01-01   0   3
2016-01-02   4   7
2016-01-03   8  11
2016-01-04  12  15
2016-01-05  16  19
2016-01-06  20  23
 
In [12]: print(df.dropna(axis=0,how='any'))  # 删除NaN数据所在行，how = {'any','all'} 
             A     B     C   D
2016-01-03   8   9.0  10.0  11
2016-01-04  12  13.0  14.0  15
2016-01-05  16  17.0  18.0  19
2016-01-06  20  21.0  22.0  23
 
In [13]: 
In [13]: print(df.dropna(axis=0,how='all'))
             A     B     C   D
2016-01-01   0   NaN   2.0   3
2016-01-02   4   5.0   NaN   7
2016-01-03   8   9.0  10.0  11
2016-01-04  12  13.0  14.0  15
2016-01-05  16  17.0  18.0  19
2016-01-06  20  21.0  22.0  23
 
In [14]: 
In [14]: print(df.dropna(axis=1,how='all'))
             A     B     C   D
2016-01-01   0   NaN   2.0   3
2016-01-02   4   5.0   NaN   7
2016-01-03   8   9.0  10.0  11
2016-01-04  12  13.0  14.0  15
2016-01-05  16  17.0  18.0  19
2016-01-06  20  21.0  22.0  23
 
In [15]: 
 
 
 
In [15]: df.fillna(value=0)                # 把NaN填充为指定数值
Out[15]: 
             A     B     C   D
2016-01-01   0   0.0   2.0   3
2016-01-02   4   5.0   0.0   7
2016-01-03   8   9.0  10.0  11
2016-01-04  12  13.0  14.0  15
2016-01-05  16  17.0  18.0  19
2016-01-06  20  21.0  22.0  23
 
In [16]: 
 
 
 
 
 
 
In [19]: print(df.isnull())                # 把数值为NaN的位置标识出来
                A      B      C      D
2016-01-01  False   True  False  False
2016-01-02  False  False   True  False
2016-01-03  False  False  False  False
2016-01-04  False  False  False  False
2016-01-05  False  False  False  False
2016-01-06  False  False  False  False
 
In [20]: 
 
 
In [22]: print(np.any(df.isnull()) == True)   # 检查DataFrame是否含有NaN值
True
 
In [23]:

Pandas导入导出示例

In [33]: import pandas as pd
 
In [34]: data = pd.read_csv('student.csv')
 
In [35]: print(data)
    Student ID  name   age  gender
0         1100  Kelly   22  Female
1         1101    Clo   21  Female
2         1102  Tilly   22  Female
3         1103   Tony   24    Male
4         1104  David   20    Male
5         1105  Catty   22  Female
6         1106      M    3  Female
7         1107      N   43    Male
8         1108      A   13    Male
9         1109      S   12    Male
10        1110  David   33    Male
11        1111     Dw    3  Female
12        1112      Q   23    Male
13        1113      W   21  Female
 
In [36]: print(type(data))
<class 'pandas.core.frame.DataFrame'>
 
In [37]: data.to_pickle('student.pickle')
 
In [38]: data.to_json('student.json')
 
In [39]:

更多IO Tools参考：官方介绍

Pandas concat合并

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

# pandas 合并
 
# concatenating
In [40]: import numpy as np
 
In [41]: import pandas as pd
 
In [42]: df1 = pd.DataFrame(np.ones((3,4))*0, columns=['a','b','c','d'])
 
In [43]: df2 = pd.DataFrame(np.ones((3,4))*1, columns=['a','b','c','d'])
 
In [44]: df3 = pd.DataFrame(np.ones((3,4))*2, columns=['a','b','c','d'])
 
In [45]: print(df1)
     a    b    c    d
0  0.0  0.0  0.0  0.0
1  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0
 
In [46]: print(df2)
     a    b    c    d
0  1.0  1.0  1.0  1.0
1  1.0  1.0  1.0  1.0
2  1.0  1.0  1.0  1.0
 
In [47]: print(df3)
     a    b    c    d
0  2.0  2.0  2.0  2.0
1  2.0  2.0  2.0  2.0
2  2.0  2.0  2.0  2.0
 
In [48]: result = pd.concat([df1,df2,df3],axis=0)   # vertical 垂直合并
 
In [49]: print(result)
     a    b    c    d
0  0.0  0.0  0.0  0.0
1  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0
0  1.0  1.0  1.0  1.0
1  1.0  1.0  1.0  1.0
2  1.0  1.0  1.0  1.0
0  2.0  2.0  2.0  2.0
1  2.0  2.0  2.0  2.0
2  2.0  2.0  2.0  2.0
 
In [50]: 
In [50]: result = pd.concat([df1,df2,df3],axis=0,ignore_index=True)  # 序号重新排列
 
In [51]: print(result)
     a    b    c    d
0  0.0  0.0  0.0  0.0
1  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0
3  1.0  1.0  1.0  1.0
4  1.0  1.0  1.0  1.0
5  1.0  1.0  1.0  1.0
6  2.0  2.0  2.0  2.0
7  2.0  2.0  2.0  2.0
8  2.0  2.0  2.0  2.0
 
In [52]:
 
 
 
 
 
# join合并   ['inner','outer']
In [63]: df1 = pd.DataFrame(np.ones((3,4))*0, columns=['a','b','c','d'],index=[1,2,3])
 
In [64]: df2 = pd.DataFrame(np.ones((3,4))*1, columns=['b','c','d','e'],index=[2,3,4])
 
In [65]: print(df1)
     a    b    c    d
1  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0
3  0.0  0.0  0.0  0.0
 
In [66]: print(df2)
     b    c    d    e
2  1.0  1.0  1.0  1.0
3  1.0  1.0  1.0  1.0
4  1.0  1.0  1.0  1.0
 
In [67]: 
In [67]: result = pd.concat([df1,df2])     # 即 pd.concat([df1,df2],join='outer') ， 默认就是outer模式
/usr/local/anaconda2/bin/ipython2:1: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.
 
To accept the future behavior, pass 'sort=True'.
 
To retain the current behavior and silence the warning, pass sort=False
 
  #!/usr/local/anaconda2/bin/python
 
In [68]: 
 
In [68]: print(result)
     a    b    c    d    e
1  0.0  0.0  0.0  0.0  NaN
2  0.0  0.0  0.0  0.0  NaN
3  0.0  0.0  0.0  0.0  NaN
2  NaN  1.0  1.0  1.0  1.0
3  NaN  1.0  1.0  1.0  1.0
4  NaN  1.0  1.0  1.0  1.0
 
In [69]:
 
In [70]: result = pd.concat([df1,df2],join='inner')  # inner模式
 
In [71]: print(result)
     b    c    d
1  0.0  0.0  0.0
2  0.0  0.0  0.0
3  0.0  0.0  0.0
2  1.0  1.0  1.0
3  1.0  1.0  1.0
4  1.0  1.0  1.0
 
In [72]: 
In [72]: result = pd.concat([df1,df2],join='inner',ignore_index=True)
 
In [73]: print(result)
     b    c    d
0  0.0  0.0  0.0
1  0.0  0.0  0.0
2  0.0  0.0  0.0
3  1.0  1.0  1.0
4  1.0  1.0  1.0
5  1.0  1.0  1.0
 
In [74]: 
 
 
 
 
# join_axes合并（注意：join_axes 参数已在 pandas 1.0 中移除；新版本可写成 pd.concat([df1, df2], axis=1).reindex(df1.index) 得到相同结果）
In [78]: res = pd.concat([df1, df2], axis=1)
 
In [79]: print(res)
     a    b    c    d    b    c    d    e
1  0.0  0.0  0.0  0.0  NaN  NaN  NaN  NaN
2  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0
3  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0
4  NaN  NaN  NaN  NaN  1.0  1.0  1.0  1.0
 
In [80]: 
In [74]: df1 = pd.DataFrame(np.ones((3,4))*0, columns=['a','b','c','d'],index=[1,2,3])
 
In [75]: df2 = pd.DataFrame(np.ones((3,4))*1, columns=['b','c','d','e'],index=[2,3,4])
 
In [76]: res = pd.concat([df1, df2], axis=1, join_axes=[df1.index])
 
In [77]: print(res)
     a    b    c    d    b    c    d    e
1  0.0  0.0  0.0  0.0  NaN  NaN  NaN  NaN
2  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0
3  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0
 
In [78]: 
In [80]: res = pd.concat([df1, df2], axis=1, join_axes=[df2.index])
 
In [81]: print(res)
     a    b    c    d    b    c    d    e
2  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0
3  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0
4  NaN  NaN  NaN  NaN  1.0  1.0  1.0  1.0
 
In [82]: 
 
 
 
 
# append合并（注意：DataFrame.append 已在 pandas 2.0 中移除；新版本请改用 pd.concat([df1, df2], ignore_index=True)）
 
In [87]: df1 = pd.DataFrame(np.ones((3,4))*0, columns=['a','b','c','d'])
 
In [88]: df2 = pd.DataFrame(np.ones((3,4))*1, columns=['a','b','c','d'])
 
In [89]: df1.append(df2,ignore_index=True)
Out[89]: 
     a    b    c    d
0  0.0  0.0  0.0  0.0
1  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0
3  1.0  1.0  1.0  1.0
4  1.0  1.0  1.0  1.0
5  1.0  1.0  1.0  1.0
 
In [90]: df3 = pd.DataFrame(np.ones((3,4))*1, columns=['a','b','c','d'])
 
In [91]: df1.append([df2,df3],ignore_index=True)
Out[91]: 
     a    b    c    d
0  0.0  0.0  0.0  0.0
1  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0
3  1.0  1.0  1.0  1.0
4  1.0  1.0  1.0  1.0
5  1.0  1.0  1.0  1.0
6  1.0  1.0  1.0  1.0
7  1.0  1.0  1.0  1.0
8  1.0  1.0  1.0  1.0
 
In [92]: 
 
 
# 添加一行数据到DataFrame
In [92]: df1 = pd.DataFrame(np.ones((3,4))*0, columns=['a','b','c','d'])
 
In [93]: s1 = pd.Series([1,2,3,4], index=['a','b','c','d'])
 
In [94]: res = df1.append(s1,ignore_index=True)
 
In [95]: print(res)
     a    b    c    d
0  0.0  0.0  0.0  0.0
1  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0
3  1.0  2.0  3.0  4.0
 
In [96]:

Pandas merge合并

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

# merge合并
In [99]: import pandas as pd
 
In [100]: left = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
     ...:                      'A': ['A0', 'A1', 'A2', 'A3'],
     ...:                      'B': ['B0', 'B1', 'B2', 'B3']})
 
In [101]: right = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
     ...:                       'C': ['C0', 'C1', 'C2', 'C3'],
     ...:                       'D': ['D0', 'D1', 'D2', 'D3']})
 
In [102]: 
 
In [102]: print(left)
    A   B key
0  A0  B0  K0
1  A1  B1  K1
2  A2  B2  K2
3  A3  B3  K3
 
In [103]: print(right)
    C   D key
0  C0  D0  K0
1  C1  D1  K1
2  C2  D2  K2
3  C3  D3  K3
 
In [104]: 
In [104]: res = pd.merge(left,right,on='key')
 
In [105]: print(res)
    A   B key   C   D
0  A0  B0  K0  C0  D0
1  A1  B1  K1  C1  D1
2  A2  B2  K2  C2  D2
3  A3  B3  K3  C3  D3
 
In [106]: 
 
 
# consider two keys
In [106]: left = pd.DataFrame({'key1': ['K0', 'K0', 'K1', 'K2'],
     ...:                       'key2': ['K0', 'K1', 'K0', 'K1'],
     ...:                       'A': ['A0', 'A1', 'A2', 'A3'],
     ...:                       'B': ['B0', 'B1', 'B2', 'B3']})
 
In [107]: right = pd.DataFrame({'key1': ['K0', 'K1', 'K1', 'K2'],
     ...:                        'key2': ['K0', 'K0', 'K0', 'K0'],
     ...:                        'C': ['C0', 'C1', 'C2', 'C3'],
     ...:                        'D': ['D0', 'D1', 'D2', 'D3']})
 
In [108]: print(left)
    A   B key1 key2
0  A0  B0   K0   K0
1  A1  B1   K0   K1
2  A2  B2   K1   K0
3  A3  B3   K2   K1
 
In [109]: print(right)
    C   D key1 key2
0  C0  D0   K0   K0
1  C1  D1   K1   K0
2  C2  D2   K1   K0
3  C3  D3   K2   K0
 
In [110]: res = pd.merge(left,right,on=['key1','key2'])
 
In [111]: print(res)
    A   B key1 key2   C   D
0  A0  B0   K0   K0  C0  D0
1  A2  B2   K1   K0  C1  D1
2  A2  B2   K1   K0  C2  D2
 
 
# how={'left','right','inner','outer'}
In [112]: res = pd.merge(left,right,on=['key1','key2'],how='inner')  # 默认就是inner模式
 
In [113]: print(res)
    A   B key1 key2   C   D
0  A0  B0   K0   K0  C0  D0
1  A2  B2   K1   K0  C1  D1
2  A2  B2   K1   K0  C2  D2
 
In [114]: res = pd.merge(left,right,on=['key1','key2'],how='outer')
 
In [115]: print(res)
     A    B key1 key2    C    D
0   A0   B0   K0   K0   C0   D0
1   A1   B1   K0   K1  NaN  NaN
2   A2   B2   K1   K0   C1   D1
3   A2   B2   K1   K0   C2   D2
4   A3   B3   K2   K1  NaN  NaN
5  NaN  NaN   K2   K0   C3   D3
 
In [116]: 
In [116]: res = pd.merge(left,right,on=['key1','key2'],how='left')
 
In [117]: print(res)
    A   B key1 key2    C    D
0  A0  B0   K0   K0   C0   D0
1  A1  B1   K0   K1  NaN  NaN
2  A2  B2   K1   K0   C1   D1
3  A2  B2   K1   K0   C2   D2
4  A3  B3   K2   K1  NaN  NaN
 
In [118]: res = pd.merge(left,right,on=['key1','key2'],how='right')
 
In [119]: print(res)
     A    B key1 key2   C   D
0   A0   B0   K0   K0  C0  D0
1   A2   B2   K1   K0  C1  D1
2   A2   B2   K1   K0  C2  D2
3  NaN  NaN   K2   K0  C3  D3
 
In [120]: 
 
 
 
# indicator
In [121]: df1 = pd.DataFrame({'col1':[0,1], 'col_left':['a','b']})
 
In [122]: df2 = pd.DataFrame({'col1':[1,2,2],'col_right':[2,2,2]})
 
In [123]: print(df1)
   col1 col_left
0     0        a
1     1        b
 
In [124]: print(df2)
   col1  col_right
0     1          2
1     2          2
2     2          2
 
In [125]: res = pd.merge(df1, df2, on='col1', how='outer', indicator=True) # 给一个提示
 
In [126]: print(res)
   col1 col_left  col_right      _merge
0     0        a        NaN   left_only
1     1        b        2.0        both
2     2      NaN        2.0  right_only
3     2      NaN        2.0  right_only
 
In [127]:
In [129]: res = pd.merge(df1, df2, on='col1', how='outer', indicator='indicator_column') # 指定提示的列名
 
In [130]: print(res)
   col1 col_left  col_right indicator_column
0     0        a        NaN        left_only
1     1        b        2.0             both
2     2      NaN        2.0       right_only
3     2      NaN        2.0       right_only
 
In [131]: 
In [127]: res = pd.merge(df1, df2, on='col1', how='outer', indicator=False)
 
In [128]: print(res)
   col1 col_left  col_right
0     0        a        NaN
1     1        b        2.0
2     2      NaN        2.0
3     2      NaN        2.0
 
In [129]: 
 
 
 
 
 
 
In [131]: left = pd.DataFrame({'A': ['A0', 'A1', 'A2'],
     ...:                      'B': ['B0', 'B1', 'B2']},
     ...:                      index=['K0', 'K1', 'K2'])
 
In [132]: right = pd.DataFrame({'C': ['C0', 'C2', 'C3'],
     ...:                       'D': ['D0', 'D2', 'D3']},
     ...:                      index=['K0', 'K2', 'K3'])
 
In [133]: print(left)
     A   B
K0  A0  B0
K1  A1  B1
K2  A2  B2
 
In [134]: print(right)
     C   D
K0  C0  D0
K2  C2  D2
K3  C3  D3
 
In [135]: res = pd.merge(left, right, left_index=True, right_index=True, how='outer')
 
In [136]: print(res)
      A    B    C    D
K0   A0   B0   C0   D0
K1   A1   B1  NaN  NaN
K2   A2   B2   C2   D2
K3  NaN  NaN   C3   D3
 
In [137]: res = pd.merge(left, right, left_index=True, right_index=True, how='inner')
 
In [138]: print(res)
     A   B   C   D
K0  A0  B0  C0  D0
K2  A2  B2  C2  D2
 
In [139]: 
 
 
 
 
 
 
# handle overlapping
In [139]: boys = pd.DataFrame({'k': ['K0', 'K1', 'K2'], 'age': [1, 2, 3]})
 
In [140]: girls = pd.DataFrame({'k': ['K0', 'K0', 'K3'], 'age': [4, 5, 6]})
 
In [141]: print(boys)
   age   k
0    1  K0
1    2  K1
2    3  K2
 
In [142]: print(girls)
   age   k
0    4  K0
1    5  K0
2    6  K3
 
In [143]: res = pd.merge(boys, girls, on='k', suffixes=['_boy', '_girl'], how='inner')
 
In [144]: print(res)
   age_boy   k  age_girl
0        1  K0         4
1        1  K0         5
 
In [145]: res = pd.merge(boys, girls, on='k', suffixes=['_boy', '_girl'], how='outer')
 
In [146]: print(res)
   age_boy   k  age_girl
0      1.0  K0       4.0
1      1.0  K0       5.0
2      2.0  K1       NaN
3      3.0  K2       NaN
4      NaN  K3       6.0
 
In [147]: 　　

关于Concat 函数、Merge 函数和 Join 函数

Pandas Moving Window Functions

Pandas plot可视化

#!/usr/bin/python2.7
 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
 
# Series: 1000 standard-normal draws indexed 0..999, accumulated into a
# random walk so the line chart shows a wandering curve instead of noise.
steps = pd.Series(np.random.randn(1000), index=np.arange(1000))
data = steps.cumsum()

# Series.plot() draws a line chart by default; plt.show() opens the window.
data.plot()
plt.show()

#!/usr/bin/python2.7
 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
 
# DataFrame: four columns (A-D) of 1000 standard-normal draws each,
# accumulated column-wise so every column becomes its own random walk.
data = pd.DataFrame(
    np.random.randn(1000, 4),
    index=np.arange(1000),
    columns=list("ABCD")
).cumsum()
# To inspect the first rows while experimenting: print(data.head(6))

# DataFrame.plot() draws one line per column on shared axes.
data.plot()
plt.show()

#!/usr/bin/python2.7
 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
 
# DataFrame: four columns (A-D) of 1000 standard-normal draws each,
# accumulated column-wise so every column becomes its own random walk.
data = pd.DataFrame(
    np.random.randn(1000, 4),
    index=np.arange(1000),
    columns=list("ABCD")
).cumsum()
# To inspect the first rows while experimenting: print(data.head(6))

# Plot kinds available as data.plot.<kind>:
#     'bar', 'hist', 'box', 'kde', 'area', 'scatter', 'pie', 'hexbin', ...
# The first scatter call returns its Axes; passing it back through ax=
# overlays the second series on the same axes instead of a new figure.
ax = data.plot.scatter(x='A', y='B', color='DarkBlue', label='Class AB')
data.plot.scatter(x='A', y='C', color='DarkGreen', label='Class AC', ax=ax)
plt.show()

补充：Matplotlib 3D图像

#!/usr/bin/python2.7
 
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
 
fig = plt.figure()
# Create the 3D axes through the figure rather than with Axes3D(fig):
# Axes3D adding itself to the figure was deprecated in Matplotlib 3.4 and
# later dropped, so on recent releases Axes3D(fig) leaves a blank window.
# add_subplot(..., projection='3d') works on old and new versions alike
# (the Axes3D import above registers the '3d' projection on old releases).
ax = fig.add_subplot(111, projection='3d')

# X, Y values: a 32x32 grid covering [-4, 4) in steps of 0.25
X = np.arange(-4, 4, 0.25)
Y = np.arange(-4, 4, 0.25)
X, Y = np.meshgrid(X, Y)
# R is each grid point's distance from the origin
R = np.sqrt(X**2 + Y**2)

# Height value: sin of the radius gives concentric ripples
Z = np.sin(R)

# rstride/cstride = 1 draws every grid row and column of the surface
ax.plot_surface(X, Y, Z, rstride=1, cstride=1, cmap=plt.get_cmap('rainbow'))

plt.show()

#!/usr/bin/python2.7
 
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
 
fig = plt.figure()
# Create the 3D axes through the figure rather than with Axes3D(fig):
# Axes3D adding itself to the figure was deprecated in Matplotlib 3.4 and
# later dropped, so on recent releases Axes3D(fig) leaves a blank window.
# add_subplot(..., projection='3d') works on old and new versions alike
# (the Axes3D import above registers the '3d' projection on old releases).
ax = fig.add_subplot(111, projection='3d')

# X, Y values: a 32x32 grid covering [-4, 4) in steps of 0.25
X = np.arange(-4, 4, 0.25)
Y = np.arange(-4, 4, 0.25)
X, Y = np.meshgrid(X, Y)
# R is each grid point's distance from the origin
R = np.sqrt(X**2 + Y**2)

# Height value: sin of the radius gives concentric ripples
Z = np.sin(R)

# rstride/cstride = 1 draws every grid row and column of the surface
ax.plot_surface(X, Y, Z, rstride=1, cstride=1, cmap=plt.get_cmap('rainbow'))

# Add filled contours of Z, projected along the z axis onto the plane z = -2
ax.contourf(X, Y, Z, zdir='z', offset=-2, cmap='rainbow')

# Extend the z range down to -2 so the projected contours are visible
ax.set_zlim(-2, 2)

plt.show()

参考：https://github.com/MorvanZhou

参考：https://morvanzhou.github.io/tutorials/

posted @ 2018-08-12 13:29 liulixin_1993 阅读(933) 评论(0) 收藏举报

刷新页面返回顶部

登录后才能查看或发表评论，立即登录或者逛逛博客园首页

阅读排行：
· .NET周刊【5月第1期 2025-05-04】
· Python 3.14 新特性盘点，更新了些什么？
· 聊聊 ruoyi-vue ,ruoyi-vue-plus ,ruoyi-vue-pro 谁才是真正的
· 物联网之对接MQTT最佳实践
· Redis 连接池耗尽的一次异常定位

历史上的今天：
2017-08-12 前端基础之JQuery

公告

2025年5月

日

一

二

三

四

五

六

青山

运动、读书、平和待人