NumPy高级应用
ndarray对象的内部机制
import numpy as np
import pandas as pd

# shape is a tuple holding the size of each dimension of the array
np.ones((10, 5)).shape
(10, 5)
np.ones((3,4,5),dtype=np.float64).strides
(160, 40, 8)
NumPy数据类型系统
ints = np.ones(10, dtype=np.uint16)
floats = np.ones(10, dtype=np.float32)
# issubdtype tests a concrete dtype against an abstract parent type
# such as np.integer or np.floating
np.issubdtype(ints.dtype, np.integer)
True
np.issubdtype(floats.dtype,np.floating)
True
# Call the dtype's mro() method to list all of its parent classes
np.float64.mro()
[numpy.float64, numpy.floating, numpy.inexact, numpy.number, numpy.generic, float, object]
高级数组操作
数组重塑
# One-dimensional array 0..7 used by the reshape examples that follow
arr = np.arange(8)
arr
array([0, 1, 2, 3, 4, 5, 6, 7])
arr.reshape((4,2))
array([[0, 1], [2, 3], [4, 5], [6, 7]])
arr.reshape((4,2)).reshape((2,4))
array([[0, 1, 2, 3],
[4, 5, 6, 7]])
#传给reshape的形状中可以有一个维度为-1,表示该维度的大小由数据本身推断而来
arr = np.arange(15)
arr.reshape((5,-1))
array([[ 0, 1, 2],
[ 3, 4, 5],
[ 6, 7, 8],
[ 9, 10, 11],
[12, 13, 14]])
other_arr = np.ones((3,5))
other_arr.shape
(3, 5)
arr.reshape(other_arr.shape)
array([[ 0, 1, 2, 3, 4],
[ 5, 6, 7, 8, 9],
[10, 11, 12, 13, 14]])
arr = np.arange(15).reshape((5,3))
arr
array([[ 0, 1, 2],
[ 3, 4, 5],
[ 6, 7, 8],
[ 9, 10, 11],
[12, 13, 14]])
#扁平化或散开;ravel在可能的情况下返回原数据的视图,只有在必要时才会产生副本
arr.ravel()
array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14])
#会返回数据的副本
arr.flatten()
array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14])
C和Fortran顺序(行优先和列优先)
arr = np.arange(12).reshape((3,4))
arr
array([[ 0, 1, 2, 3],
[ 4, 5, 6, 7],
[ 8, 9, 10, 11]])
arr.ravel()
array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11])
arr.ravel('F')
array([ 0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11])
C/行优先顺序:先遍历较高的维度(例如,先沿轴1走完,再在轴0上前进)。
Fortran/列优先顺序:后遍历较高的维度(例如,先沿轴0走完,再在轴1上前进)。
数组的合并和拆分
arr1 = np.array([[1,2,3],[4,5,6]])
arr2 = np.array([[7,8,9,],[10,11,12]])
np.concatenate([arr1,arr2],axis=0)
array([[ 1, 2, 3],
[ 4, 5, 6],
[ 7, 8, 9],
[10, 11, 12]])
np.concatenate([arr1,arr2],axis=1)
array([[ 1, 2, 3, 7, 8, 9],
[ 4, 5, 6, 10, 11, 12]])
#比较方便的连接操作
np.vstack((arr1,arr2))
array([[ 1, 2, 3],
[ 4, 5, 6],
[ 7, 8, 9],
[10, 11, 12]])
np.hstack((arr1,arr2))
array([[ 1, 2, 3, 7, 8, 9],
[ 4, 5, 6, 10, 11, 12]])
split用于将一个数组沿指定轴拆分为多个数组
from numpy.random import randn
arr = randn(5,2)
arr
array([[-1.15791597, 1.75726353],
[-0.58322042, -0.4999356 ],
[-0.45514448, -0.53653599],
[ 0.09353445, -0.61268623],
[ 1.48642482, 0.96240678]])
first,second,third = np.split(arr,[1,3])
first
array([[-1.15791597, 1.75726353]])
second
array([[-0.58322042, -0.4999356 ],
[-0.45514448, -0.53653599]])
third
array([[ 0.09353445, -0.61268623],
[ 1.48642482, 0.96240678]])
# Key point: load and display a reference image from disk
# (the file name translates to "array concatenation functions").
import matplotlib.pyplot as plt
# NOTE(review): the bare name imshow used below is presumably supplied by this
# star import; it also pulls many other names into the namespace -- confirm
# before replacing it with plt.imshow.
from pylab import *
img = plt.imread('数组连接函数.png')
imshow(img)
堆叠辅助类:r_和c_
arr = np.arange(6)
arr1 = arr.reshape((3,2))
arr2 = randn(3,2)
np.r_[arr1,arr2]
array([[ 0. , 1. ],
[ 2. , 3. ],
[ 4. , 5. ],
[-0.15830216, -0.17789986],
[ 0.92069385, -0.61503971],
[ 0.0215242 , -0.31915158]])
np.c_[np.r_[arr1,arr2],arr]
array([[ 0. , 1. , 0. ],
[ 2. , 3. , 1. ],
[ 4. , 5. , 2. ],
[-0.15830216, -0.17789986, 3. ],
[ 0.92069385, -0.61503971, 4. ],
[ 0.0215242 , -0.31915158, 5. ]])
#切片翻译为数组
np.c_[1:6,-10:-5]
array([[ 1, -10],
[ 2, -9],
[ 3, -8],
[ 4, -7],
[ 5, -6]])
元素的重复操作:tile和repeat
arr = np.arange(3)
arr.repeat(3)
array([0, 0, 0, 1, 1, 1, 2, 2, 2])
arr.repeat([2,3,4])
array([0, 0, 1, 1, 1, 2, 2, 2, 2])
arr = randn(2,2)
arr
array([[-1.36101126, 1.20707733],
[-2.02673963, -0.32688146]])
arr.repeat(2,axis=0)
array([[-1.36101126, 1.20707733],
[-1.36101126, 1.20707733],
[-2.02673963, -0.32688146],
[-2.02673963, -0.32688146]])
arr.repeat([2,3],axis=0)
array([[-1.36101126, 1.20707733],
[-1.36101126, 1.20707733],
[-2.02673963, -0.32688146],
[-2.02673963, -0.32688146],
[-2.02673963, -0.32688146]])
arr.repeat([2,3],axis=1)
array([[-1.36101126, -1.36101126, 1.20707733, 1.20707733, 1.20707733],
[-2.02673963, -2.02673963, -0.32688146, -0.32688146, -0.32688146]])
arr
array([[-1.36101126, 1.20707733],
[-2.02673963, -0.32688146]])
#tile的功能是沿指定轴向堆叠数组的副本。
np.tile(arr,2)
array([[-1.36101126, 1.20707733, -1.36101126, 1.20707733],
[-2.02673963, -0.32688146, -2.02673963, -0.32688146]])
np.tile(arr,(2,1))#可看成两行一列
array([[-1.36101126, 1.20707733],
[-2.02673963, -0.32688146],
[-1.36101126, 1.20707733],
[-2.02673963, -0.32688146]])
np.tile(arr,(3,2))#可看成三行两列
array([[-1.36101126, 1.20707733, -1.36101126, 1.20707733],
[-2.02673963, -0.32688146, -2.02673963, -0.32688146],
[-1.36101126, 1.20707733, -1.36101126, 1.20707733],
[-2.02673963, -0.32688146, -2.02673963, -0.32688146],
[-1.36101126, 1.20707733, -1.36101126, 1.20707733],
[-2.02673963, -0.32688146, -2.02673963, -0.32688146]])
花式索引的等价函数:take和put
arr = np.arange(10)*100
inds = [7,1,2,6]
arr[inds]
array([700, 100, 200, 600])
arr.take(inds)
array([700, 100, 200, 600])
arr.put(inds,42)
arr
array([ 0, 42, 42, 300, 400, 500, 42, 42, 800, 900])
inds = [2,0,2,1]
arr = randn(2,4)
arr
array([[-0.50161986, -0.0522166 , 0.76367148, -0.55916362],
[-1.00260667, 0.43677575, 0.59555349, 1.09068029]])
arr.take(inds,axis=1)
array([[ 0.76367148, -0.50161986, 0.76367148, -0.0522166 ],
[ 0.59555349, -1.00260667, 0.59555349, 0.43677575]])
广播
广播指的是不同形状的数组之间的算术运算的执行方式。 将标量值跟数组合并时就会发生最简单的广播。
arr = np.arange(5)
arr
array([0, 1, 2, 3, 4])
arr * 4
array([ 0, 4, 8, 12, 16])
arr = randn(4,3)
arr.mean(0)
array([ 0.20388605, 0.3803822 , -0.12810073])
demeaned = arr - arr.mean(0)
demeaned
array([[ 1.78874981, 0.15783634, -0.50319093],
[ 0.43109711, 0.46940631, -0.86418856],
[-0.96776341, -0.21361791, 0.03363107],
[-1.25208351, -0.41362474, 1.33374841]])
demeaned.mean(0)
array([ 0.00000000e+00, -2.77555756e-17, -5.55111512e-17])
arr
array([[ 1.99263586, 0.53821853, -0.63129166],
[ 0.63498315, 0.84978851, -0.99228929],
[-0.76387737, 0.16676429, -0.09446966],
[-1.04819746, -0.03324254, 1.20564768]])
row_means = arr.mean(1)
row_means.reshape((4,1))
array([[ 0.63318758],
[ 0.16416079],
[-0.23052758],
[ 0.04140256]])
demeaned = arr - row_means.reshape((4,1))
demeaned.mean(1)
array([1.48029737e-16, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00])
沿其他轴向广播
arr - arr.mean(1).reshape((4,1))
array([[ 1.35944828, -0.09496904, -1.26447924],
[ 0.47082236, 0.68562772, -1.15645008],
[-0.53334979, 0.39729187, 0.13605792],
[-1.08960002, -0.0746451 , 1.16424512]])
arr = np.zeros((4,4))
arr_3d = arr[:,np.newaxis,:]
arr_3d.shape
(4, 1, 4)
arr_1d = np.random.normal(size=3)
arr_1d[:,np.newaxis]
array([[2.04800711],
[0.52206057],
[1.07754942]])
arr_1d[np.newaxis,:]
array([[2.04800711, 0.52206057, 1.07754942]])
#如果有一个三维数组,并希望对轴2进行距平化
arr = randn(3,4,5)
depth_means = arr.mean(2)
depth_means
array([[ 0.03148651, 0.75426701, -0.09871035, -0.14189834],
[-0.13307934, 0.43572469, -0.28305591, 0.11958806],
[ 0.65974424, -0.51244655, 0.00827298, -0.7063478 ]])
demeaned = arr - depth_means[:,:,np.newaxis]
demeaned.mean(2)
array([[-1.38777878e-18, 8.88178420e-17, 0.00000000e+00,
4.44089210e-17],
[ 0.00000000e+00, 4.44089210e-17, -2.22044605e-17,
-2.22044605e-17],
[-4.44089210e-17, 8.88178420e-17, -4.44089210e-17,
4.44089210e-17]])
arr
array([[[-0.17944366, 0.26318479, -0.61798966, 0.61224075,
0.07944035],
[ 0.5355697 , 0.77165914, 2.35891609, 0.83188406,
-0.72669392],
[ 0.51017447, -0.3048777 , -0.69343153, 0.62523682,
-0.63065379],
[ 0.6120236 , 0.93732656, -0.90731587, -2.35714756,
1.00562157]],
[[-1.08873953, -0.09360078, -0.24163424, 0.88653706,
-0.1279592 ],
[-0.59340819, 1.22832916, -0.52064469, 1.46835124,
0.59599591],
[-0.6261998 , 0.29504874, -0.97687309, -0.56734596,
0.46009055],
[ 0.86167635, -1.17828692, -0.57720655, 1.86401846,
-0.37226101]],
[[ 0.7998406 , 0.33150187, 0.07888266, 1.99724594,
0.09125012],
[-0.51269232, -0.08079038, -0.69063807, 0.29238486,
-1.57049686],
[ 1.16061633, 0.34521942, -1.59255222, -0.1817776 ,
0.30985897],
[-0.77002413, 1.63996696, -1.66131826, -0.90786592,
-1.83249766]]])
def demean_axis(arr, axis=0):
    """Subtract the mean taken along ``axis`` from ``arr`` via broadcasting.

    Parameters
    ----------
    arr : numpy.ndarray
        Input array of any dimensionality.
    axis : int, default 0
        Axis along which the mean is computed and removed.

    Returns
    -------
    numpy.ndarray
        Array with the same shape as ``arr`` whose mean along ``axis``
        is zero up to floating-point rounding.
    """
    means = arr.mean(axis)
    # N-dimensional generalization of means[:, :, np.newaxis]: keep every
    # axis as a full slice and re-insert a length-1 axis where the mean
    # collapsed one, so the subtraction broadcasts along ``axis``.
    indexer = [slice(None)] * arr.ndim
    indexer[axis] = np.newaxis
    # Index with a tuple: a list holding slices/None is interpreted as fancy
    # indexing and is rejected by current NumPy releases.
    return arr - means[tuple(indexer)]
通过广播设置数组的值
arr = np.zeros((4,3))
arr[:] = 5
arr
array([[5., 5., 5.],
[5., 5., 5.],
[5., 5., 5.],
[5., 5., 5.]])
col = np.array([1.28,-0.42,0.44,1.6])
arr[:] = col[:,np.newaxis]
arr
array([[ 1.28, 1.28, 1.28],
[-0.42, -0.42, -0.42],
[ 0.44, 0.44, 0.44],
[ 1.6 , 1.6 , 1.6 ]])
arr[:2] = [[-1.37],[0.509]]
arr
array([[-1.37 , -1.37 , -1.37 ],
[ 0.509, 0.509, 0.509],
[ 0.44 , 0.44 , 0.44 ],
[ 1.6 , 1.6 , 1.6 ]])
ufunc高级应用
ufunc实例方法
arr = np.arange(10)
np.add.reduce(arr)
45
arr.sum()
45
arr = randn(5,5)
arr[::2].sort(1)#对部分行进行排序
arr[:,:-1] < arr[:,1:]
array([[ True, True, True, True],
[ True, False, True, True],
[ True, True, True, True],
[ True, False, False, True],
[ True, True, True, True]])
np.logical_and.reduce(arr[:,:-1]<arr[:,1:],axis=1)
array([ True, False, True, False, True])
arr = np.arange(15).reshape((3,5))
np.add.accumulate(arr,axis=1)
array([[ 0, 1, 3, 6, 10],
[ 5, 11, 18, 26, 35],
[10, 21, 33, 46, 60]], dtype=int32)
arr = np.arange(3).repeat([1,2,2])
arr
array([0, 1, 1, 2, 2])
#outer对两个数组中的所有元素对逐一执行运算(即外积,而非叉积),输出结果的维度数是两个输入的维度数之和
np.multiply.outer(arr,np.arange(5))
array([[0, 0, 0, 0, 0],
[0, 1, 2, 3, 4],
[0, 1, 2, 3, 4],
[0, 2, 4, 6, 8],
[0, 2, 4, 6, 8]])
result = np.subtract.outer(randn(3,4),randn(5))
result.shape
(3, 4, 5)
arr = np.arange(10)
np.add.reduceat(arr,[0,5,8])
array([10, 18, 17], dtype=int32)
arr = np.multiply.outer(np.arange(4),np.arange(5))
arr
array([[ 0, 0, 0, 0, 0],
[ 0, 1, 2, 3, 4],
[ 0, 2, 4, 6, 8],
[ 0, 3, 6, 9, 12]])
np.add.reduceat(arr,[0,2,4],axis=1)
array([[ 0, 0, 0],
[ 1, 5, 4],
[ 2, 10, 8],
[ 3, 15, 12]], dtype=int32)
# Key point: load and display a reference image from disk
# (the file name translates to "ufunc methods").
import matplotlib.pyplot as plt
# NOTE(review): the bare name imshow used below is presumably supplied by this
# star import; it also pulls many other names into the namespace -- confirm
# before replacing it with plt.imshow.
from pylab import *
img = plt.imread('ufunc的方法.png')
imshow(img)
自定义ufunc
def add_elements(x, y):
    """Return ``x + y``.

    A plain Python binary function used as the scalar kernel that is later
    wrapped into an element-wise array function.
    """
    return x + y
#np.frompyfunc接受一个Python函数以及两个表示输入输出参数数量的整数
add_them = np.frompyfunc(add_elements,2,1)
add_them(np.arange(8),np.arange(8))
array([0, 2, 4, 6, 8, 10, 12, 14], dtype=object)
add_them = np.vectorize(add_elements,otypes=[np.float64])
add_them(np.arange(8),np.arange(8))
array([ 0., 2., 4., 6., 8., 10., 12., 14.])
arr = randn(10000)
%timeit add_them(arr,arr)
1.38 ms ± 12 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
%timeit np.add(arr,arr)
3.04 µs ± 37.3 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
结构化和记录式数组
dtype = [('x',np.float64),('y',np.int32)]
sarr = np.array([(1.5,6),(np.pi,-2)],dtype=dtype)
sarr
array([(1.5 , 6), (3.14159265, -2)],
dtype=[('x', '<f8'), ('y', '<i4')])
sarr[0]
(1.5, 6)
sarr[0]['y']
6
#在访问结构化数组的某个字段时,返回的是该数据的视图,不会发生数据复制。
sarr['x']
array([1.5 , 3.14159265])
嵌套dtype和多维字段
dtype = [('x',np.int64,3),('y',np.int32)]
arr = np.zeros(4,dtype=dtype)
arr
array([([0, 0, 0], 0), ([0, 0, 0], 0), ([0, 0, 0], 0), ([0, 0, 0], 0)],
dtype=[('x', '<i8', (3,)), ('y', '<i4')])
#各个记录的x字段所表示的是一个长度为3的数组。
arr[0]['x']
array([0, 0, 0], dtype=int64)
arr['x']
array([[0, 0, 0],
[0, 0, 0],
[0, 0, 0],
[0, 0, 0]], dtype=int64)
dtype = [('x',[('a','f8'),('b','f4')]),('y',np.int32)]
data = np.array([((1,2),5),((3,4),6)],dtype=dtype)
data['x']
array([(1., 2.), (3., 4.)], dtype=[('a', '<f8'), ('b', '<f4')])
data['y']
array([5, 6])
data['x']['a']
array([1., 3.])
更多有关排序的话题
arr = randn(6)
arr.sort()
arr
array([-2.95073509, -0.94199533, -0.78121186, -0.36939853, -0.09198971,
0.56171152])
arr = randn(3,5)
arr
array([[ 0.22957599, -0.70340068, -0.21478486, -0.52291557, 0.56302319],
[ 0.05118732, 0.7244084 , -0.10359424, 0.09073078, -1.2389333 ],
[-1.86520752, -0.68797996, -0.10034156, -0.44550695, 1.93762706]])
arr[:,0].sort()#Sort first column values in-place
arr
array([[-1.86520752, -0.70340068, -0.21478486, -0.52291557, 0.56302319],
[ 0.05118732, 0.7244084 , -0.10359424, 0.09073078, -1.2389333 ],
[ 0.22957599, -0.68797996, -0.10034156, -0.44550695, 1.93762706]])
#numpy.sort会为原数组创建一个已排序副本
arr = randn(5)
arr
array([-0.954815 , -1.85949416, 0.19515002, -0.67403638, 1.15339208])
np.sort(arr)
array([-1.85949416, -0.954815 , -0.67403638, 0.19515002, 1.15339208])
arr
array([-0.954815 , -1.85949416, 0.19515002, -0.67403638, 1.15339208])
arr = randn(3,5)
arr
array([[ 0.33522337, -2.02572115, 0.17967668, -0.79655343, -0.95793491],
[ 1.28492436, -2.56428536, 0.02044797, 0.75447879, 1.31840572],
[ 0.30122699, -1.02623633, -1.32280617, -0.39888812, -0.47165096]])
arr.sort(axis=1)
arr
array([[-2.02572115, -0.95793491, -0.79655343, 0.17967668, 0.33522337],
[-2.56428536, 0