1."一维数组"Series
Pandas数据结构Series:基本概念及创建
s.index 、 s.values
# Series 数据结构# Series 是带有标签的一维数组,可以保存任何数据类型(整数,字符串,浮点数,Python对象等),轴标签统称为索引import numpy as np import pandas as pd >>> s = pd.Series(np.random.rand(5))>>> print(s,type(s))0 0.6103181 0.2356602 0.6064453 0.0707944 0.217530dtype: float64>>> print(s.index,type(s.index))RangeIndex(start=0, stop=5, step=1) >>> print(s.values, type(s.values))[0.61031815 0.23566007 0.60644485 0.0707941 0.21753049] >>># .index查看series索引,类型为rangeindex# .values查看series值,类型是ndarray# 核心:series相比于ndarray,是一个自带索引index的数组 → 一维数组 + 对应索引# 所以当只看series的值的时候,就是一个ndarray# series和ndarray较相似,索引切片功能差别不大# series和dict相比,series更像一个有顺序的字典(dict本身不存在顺序),其索引原理与字典相似(一个用key,一个用index)
1.1 Series 创建方法
由字典创建,字典的key就是index,values就是values
#Series 创建方法一:由字典创建,字典的key就是index,values就是values >>> dic = { 'a':1,'b':2,'c':3,'4':4,'5':5}>>> s = pd.Series(dic)>>> print(s)a 1b 2c 34 45 5dtype: int64# 注意:key肯定是字符串,假如values类型不止一个会怎么样? → dic = {'a':1 ,'b':'hello' , 'c':3, '4':4, '5':5}>>> dic = { 'a':1 ,'b':'hello' , 'c':3, '4':4, '5':5}>>> s = pd.Series(dic)>>> print(s)a 1b helloc 34 45 5dtype: object>>>
# Series 创建方法二:由数组创建(一维数组) >>> arr = np.random.randn(5)>>> s = pd.Series(arr) # 默认index是从0开始,步长为1的数字>>> print(arr)[1.08349965 0.52441811 0.76972371 0.35454797 0.39607907]>>> print(s)0 1.0835001 0.5244182 0.7697243 0.3545484 0.396079dtype: float64>>>>>> s = pd.Series(arr,index = ['a','b','c','d','e'],dtype = np.object)>>> print(s)a 1.083500b 0.524418c 0.769724d 0.354548e 0.396079dtype: object# index参数:设置index,长度保持一致# dtype参数:设置数值类型
# Series 创建方法三:由标量创建 >>> s = pd.Series(10,index = range(4))>>> print(s)0 101 102 103 10dtype: int64# 如果data是标量值,则必须提供索引。该值会重复,来匹配索引的长度
# Series 名称属性:name>>> s1 = pd.Series(np.random.randn(5))>>> print(s1)0 -0.4416271 -0.0821862 0.3794613 0.1631834 0.851316dtype: float64>>> s2 = pd.Series(np.random.randn(5),name='test')>>> print(s2)0 -0.9517561 0.0392722 0.6185963 -0.0279754 0.409068Name: test, dtype: float64>>> print(s1.name,s2.name,type(s2.name))None test# name为Series的一个参数,创建一个数组的 名称# .name方法:输出数组的名称,输出格式为str,如果没用定义输出名称,输出为None>>> s3 = s2.rename('hahaha')>>> print(s3)0 -0.9517561 0.0392722 0.6185963 -0.0279754 0.409068Name: hahaha, dtype: float64>>> print(s3.name,s2.name)hahaha test>>># .rename()重命名一个数组的名称,并且新指向一个数组,原数组不变
1.2 Series:索引
位置下标索引: s[0] 、s[-1](默认整数索引下不存在标签 -1,会报错) 、s[1:4] 左闭右开;
标签索引: s['b'] 、s[ ['a', 'b', 'c'] ] 、s['a':'c'](末端包含);
布尔索引: s.isnull () s.notnull() s[s>50] s[ s.notnull() ]
# 位置下标,类似序列 >>> s = pd.Series(np.random.rand(5))>>> print(s)0 0.2338011 0.8281252 0.1849253 0.2972794 0.346561dtype: float64>>> print(s[0],type(s[0]),s[0].dtype)0.23380091830372507float64>>> print(float(s[0]),type(float(s[0])))0.23380091830372507 #print(s[-1])# 位置下标从0开始# 输出结果为numpy.float格式,# 可以通过float()函数转换为python float格式# numpy.float与float占用字节不同# s[-1]结果如何? 会报错
# 标签索引 >>> s = pd.Series(np.random.rand(5),index=['a','b','c','d','e'])>>> print(s)a 0.685577b 0.998041c 0.451358d 0.832554e 0.090653dtype: float64>>> print(s['a'],type(s['a']),s['a'].dtype)0.6855772922411842float64# 方法类似下标索引,用[]表示,内写上index,注意index是字符串>>> sci = s[['a','b','e']]>>> print(sci,type(sci))a 0.685577b 0.998041e 0.090653dtype: float64 >>># 如果需要选择多个标签的值,用[[]]来表示(相当于[]中包含一个列表)# 多标签索引结果是新的数组
# 切片索引 >>> s1 = pd.Series(np.random.rand(5))>>> s2 = pd.Series(np.random.rand(5),index=['a','b','c','d','e'])>>> print(s1,'\n',s2)0 0.9176531 0.7631792 0.8378073 0.3444354 0.360922dtype: float64a 0.126537b 0.699155c 0.289233d 0.831209e 0.273572dtype: float64>>>>>> print(s1[1:4],s1[4]) #左闭右开1 0.7631792 0.8378073 0.344435dtype: float64 0.36092197040034457>>> print(s2['a':'c'],s2['c']) #用index做切片末端是包含的 a 0.126537b 0.699155c 0.289233dtype: float64 0.28923306798234194>>> print(s2[0:3],s2[3])a 0.126537b 0.699155c 0.289233dtype: float64 0.8312088483742163 # 注意:用index做切片是末端包含 >>> print(s2[:-1])a 0.126537b 0.699155c 0.289233d 0.831209 ##不包含末端的e dtype: float64>>> print(s2[::2])a 0.126537c 0.289233e 0.273572dtype: float64 # 下标索引做切片,和list写法一样
# 布尔型索引 >>> s = pd.Series(np.random.rand(3)*100)>>> s[4] = None>>> print(s)0 19.95151 59.91332 97.98544 Nonedtype: object>>> bs1 = s > 50>>> bs2 = s.isnull()>>> bs3 = s.notnull() >>> print(bs1, type(bs1),bs1.dtype)0 False1 True2 True4 Falsedtype: boolbool>>> print(bs2, type(bs2),bs2.dtype)0 False1 False2 False4 Truedtype: bool bool>>> print(bs3, type(bs3),bs3.dtype)0 True1 True2 True4 Falsedtype: bool bool>>># 数组做判断之后,返回的是一个由布尔值组成的新的数组# .isnull() / .notnull() 判断是否为空值 (None代表空值,NaN代表有问题的数值,两个都会识别为空值)>>> print(s[s > 50])1 59.91332 97.9854dtype: object>>> print(s[bs3])0 19.95151 59.91332 97.9854dtype: object>>># 布尔型索引方法:用[判断条件]表示,其中判断条件可以是 一个语句,或者是 一个布尔型数组!
1.3 Series:基本技巧
数据查看(.head() 、.tail()) / 重新索引:按给定的新 index 重排,原来不存在的标签引入缺失值(reindex(列表)) / 对齐(s1+s2) / 添加(s1.append(s2);pandas 2.0 起该方法已移除,可改用 pd.concat([s1, s2]))、 修改 s['a']=10 、 删除值 s.drop('a')
# 数据查看>>> s = pd.Series(np.random.rand(50))>>> print(s.head(10))0 0.2824751 0.0121532 0.6424873 0.9065134 0.1957095 0.8285066 0.1946327 0.1971388 0.5035669 0.897846dtype: float64>>> print(s.tail())45 0.96391646 0.64268847 0.86584048 0.83574649 0.905786dtype: float64# .head()查看头部数据# .tail()查看尾部数据# 默认查看5条
# 重新索引reindex# .reindex将会根据索引重新排序,如果当前索引不存在,则引入缺失值>>> s = pd.Series(np.random.rand(3),index=['a','b','c'])>>> print(s)a 0.239126b 0.862137c 0.501479dtype: float64>>> s1 = s.reindex(['c','b','a','d'])>>> print(s1)c 0.501479b 0.862137a 0.239126d NaNdtype: float64# .reindex()中也是写列表# 这里'd'索引不存在,所以值为NaN>>> s2 = s.reindex(['c','b','a','d'],fill_value=0) # fill_value参数:填充缺失值的值
>>> print(s2) c 0.501479 b 0.862137 a 0.239126 d 0.000000 dtype: float64
# Series对齐>>> s1 = pd.Series(np.random.rand(3),index=['Jack','Marry','Kris'])>>> s2 = pd.Series(np.random.rand(3),index=['Wang','Jack','Marry'])>>> print(s1)Jack 0.583406Marry 0.603579Kris 0.812511dtype: float64>>> print(s2)Wang 0.582852Jack 0.975184Marry 0.990203dtype: float64>>> print(s1+s2)Jack 1.558589Kris NaNMarry 1.593783Wang NaNdtype: float64# Series 和 ndarray 之间的主要区别是,Series 上的操作会根据标签自动对齐# index顺序不会影响数值计算,以标签来计算# 空值和任何值计算结果仍为空值
# 删除:.drop>>> s = pd.Series(np.random.rand(5),index=list('ngjur'))>>> print(s)n 0.239752g 0.643085j 0.313229u 0.231923r 0.836070dtype: float64>>> s1 = s.drop('n')>>> print(s1)g 0.643085j 0.313229u 0.231923r 0.836070dtype: float64>>> s2 = s.drop(['g','j'])>>> print(s2)n 0.239752u 0.231923r 0.836070dtype: float64>>> print(s)n 0.239752g 0.643085j 0.313229u 0.231923r 0.836070dtype: float64# drop 删除元素之后返回副本(inplace=False)
# 添加 >>> s1 = pd.Series(np.random.rand(5))>>> s2 = pd.Series(np.random.rand(5),index=list('ngjur'))>>> print(s1,'\n',s2)0 0.4172491 0.2266552 0.7980183 0.9843984 0.304693dtype: float64n 0.354443g 0.609306j 0.103994u 0.392755r 0.302959dtype: float64>>> s1[5] = 100>>> s2['a'] = 100>>> print(s1,'\n',s2)0 0.4172491 0.2266552 0.7980183 0.9843984 0.3046935 100.000000dtype: float64n 0.354443g 0.609306j 0.103994u 0.392755r 0.302959a 100.000000dtype: float64# 直接通过下标索引/标签index添加值 >>> s3 = s1.append(s2)>>> print(s3,'\n',s1)0 0.4172491 0.2266552 0.7980183 0.9843984 0.3046935 100.000000n 0.354443g 0.609306j 0.103994u 0.392755r 0.302959a 100.000000dtype: float64 0 0.4172491 0.2266552 0.7980183 0.9843984 0.3046935 100.000000dtype: float64# 通过.append方法,直接添加一个数组# .append方法生成一个新的数组,不改变之前的数组
# 修改 >>> s = pd.Series(np.random.rand(3),index=['a','b','c'])>>> print(s)a 0.246992b 0.349735c 0.395859dtype: float64>>> s['a'] = 100>>> s[['b','c']] = 200>>> print(s)a 100.0b 200.0c 200.0dtype: float64>>># 通过索引直接修改,类似序列
2. Pandas数据结构Dataframe
2.1 基本概念及创建
"二维数组"Dataframe:是一个表格型的数据结构,包含一组有序的列,其列的值类型可以是数值、字符串、布尔值等。
Dataframe中的数据以一个或多个二维块存放,不是列表、字典或一维数组结构。
# Dataframe 数据结构# Dataframe是一个表格型的数据结构,“带有标签的二维数组”。# Dataframe带有index(行标签)和columns(列标签)>>> data = { 'name':['Jack','Tom','Marry'],... 'age':[18,19,20],... 'gender':['m','m','w']}>>> frame = pd.DataFrame(data)>>> print(frame) name age gender0 Jack 18 m1 Tom 19 m2 Marry 20 w>>> print(type(frame))>>> print(frame.index,'\n该数据类型为:',type(frame.index))RangeIndex(start=0, stop=3, step=1)该数据类型为: >>> print(frame.columns,'\n该数据类型为:',type(frame.columns))Index(['name', 'age', 'gender'], dtype='object')该数据类型为: >>> print(frame.values,'\n该数据类型为:',type(frame.values))[['Jack' 18 'm'] ['Tom' 19 'm'] ['Marry' 20 'w']]该数据类型为: # 查看数据,数据类型为dataframe# .index查看行标签# .columns查看列标签# .values查看值,数据类型为ndarray
# Dataframe 创建方法一:由数组/list组成的字典 # 创建方法:pandas.Dataframe()>>> data1 = { 'a':[1,2,3], 'b':[3,4,5], 'c':[5,6,7]}>>> data2 = { 'one':np.random.rand(3),'two':np.random.rand(3)} ## 这里如果尝试 'two':np.random.rand(4) 会怎么样?转为DataFrame会报错--> {'one': array([0.938673 , 0.90796881, 0.8890414 ]), 'two': array([0.37261493, 0.70430298, 0.24494145, 0.3924875 ])},转为DataFrame 则 ValueError: arrays must all be same length >>> print(data1,'\n',data2){ 'a': [1, 2, 3], 'b': [3, 4, 5], 'c': [5, 6, 7]} {'one': array([0.76701471, 0.01005053, 0.09453216]), 'two': array([0.58442534, 0.14610703, 0.03588291]))
>>> df1 = pd.DataFrame(data1)>>> df2 = pd.DataFrame(data2)>>> print(df1,'\n',df2) a b c0 1 3 51 2 4 62 3 5 7 one two0 0.767015 0.5730351 0.010051 0.8926242 0.094532 0.228811>>># 由数组/list组成的字典 创建Dataframe,columns为字典key,index为默认数字标签# 字典的值的长度必须保持一致!>>> df1 = pd.DataFrame(data1,columns=['b','c','a','d'])>>> print(df1) b c a d0 3 5 1 NaN1 4 6 2 NaN2 5 7 3 NaN>>> df1 = pd.DataFrame(data1,columns=['b','c'])>>> print(df1) b c0 3 51 4 62 5 7# columns参数:可以重新指定列的顺序,格式为list,如果现有数据中没有该列(比如'd'),则产生NaN值# 如果columns重新指定时候,列的数量可以少于原数据>>> df2 = pd.DataFrame(data2,index=['f1','f2','f3']) # 这里如果尝试 index = ['f1','f2','f3','f4'] 会怎么样?长度不一致,报错>>> print(df2) one twof1 0.767015 0.573035f2 0.010051 0.892624f3 0.094532 0.228811>>># index参数:重新定义index,格式为list,长度必须保持一致
# Dataframe 创建方法二:由Series组成的字典 >>> data1 = { 'one':pd.Series(np.random.rand(2)),'two':pd.Series(np.random.rand(3))} # 没有设置index的Series>>> data2 = { 'one':pd.Series(np.random.rand(2),index=['a','b']),'two':pd.Series(np.random.rand(3),index=['a','b','c'])} # 设置了index的Series>>> print(data1,'\n',data2){ 'one': 0 0.6824551 0.282592dtype: float64, 'two': 0 0.9950541 0.7815872 0.959304dtype: float64} { 'one': a 0.940915b 0.792245dtype: float64, 'two': a 0.609878b 0.910182c 0.245590dtype: float64}>>> df1 = pd.DataFrame(data1)>>> df2 = pd.DataFrame(data2)>>> print(df1) one two0 0.682455 0.9950541 0.282592 0.7815872 NaN 0.959304>>> print(df2) one twoa 0.940915 0.609878b 0.792245 0.910182c NaN 0.245590>>># 由Seris组成的字典 创建Dataframe,columns为字典key,index为Series的标签(如果Series没有指定标签,则是默认数字标签)# Series可以长度不一样,生成的Dataframe会出现NaN值
# Dataframe 创建方法三:通过二维数组直接创建 >>> ar = np.random.rand(9).reshape(3,3)>>> print(ar)[[0.43760945 0.3563898 0.16767573] [0.26565413 0.61673585 0.54037501] [0.95541978 0.05395517 0.02045977]]>>> df1 = pd.DataFrame(ar)>>> df2 = pd.DataFrame(ar,index=['a','b','c'],columns=['one','two','three'])>>> print(df1,'\n',df2) 0 1 20 0.437609 0.356390 0.1676761 0.265654 0.616736 0.5403752 0.955420 0.053955 0.020460 one two threea 0.437609 0.356390 0.167676b 0.265654 0.616736 0.540375c 0.955420 0.053955 0.020460>>># 通过二维数组直接创建Dataframe,得到一样形状的结果数据,如果不指定index和columns,两者均返回默认数字格式# index和colunms指定长度与原数组保持一致
# Dataframe 创建方法四:由字典组成的列表 >>> data = [{ 'one':1,'two':2},{ 'one':5,'two':10,'three':20}]>>> print(data)[{ 'one': 1, 'two': 2}, { 'one': 5, 'two': 10, 'three': 20}]>>> df1 = pd.DataFrame(data)>>> df2 = pd.DataFrame(data,index = ['a','b'])>>> df3 = pd.DataFrame(data,columns = ['one','two'])>>> print(df1,'\n',df2,'\n',df3) one three two0 1 NaN 21 5 20.0 10 one three twoa 1 NaN 2b 5 20.0 10 one two0 1 21 5 10>>># 由字典组成的列表创建Dataframe,columns为字典的key,index不做指定则为默认数组标签# colunms和index参数分别重新指定相应列及行标签
# Dataframe 创建方法五:由字典组成的字典 data = { 'Jack':{ 'math':90,'english':89,'art':78}, 'Marry':{ 'math':82,'english':95,'art':92}, 'Tom':{ 'math':78,'english':67}}df1 = pd.DataFrame(data)print(df1)# 由字典组成的字典创建Dataframe,columns为字典的key,index为子字典的keydf2 = pd.DataFrame(data, columns = ['Jack','Tom','Bob'])df3 = pd.DataFrame(data, index = ['a','b','c'])print(df2)print(df3)# columns参数可以增加和减少现有列,如出现新的列,值为NaN# index在这里和之前不同,并不能改变原有index,如果指向新的标签,值为NaN (非常重要!)#在cmd或pycharm里边报错。AttributeError: 'list' object has no attribute 'astype' Jack Marry Tomart 78 92 NaNenglish 89 95 67.0math 90 82 78.0 Jack Tom Bobart 78 NaN NaNenglish 89 67.0 NaNmath 90 78.0 NaN Jack Marry Toma NaN NaN NaNb NaN NaN NaNc NaN NaN NaN
2.2 Dataframe:索引
Dataframe既有行索引也有列索引,可以被看做由Series组成的字典(共用一个索引)
选择列 / 选择行 / 切片 / 布尔判断
df [ 'a' ] df [ ['a', 'b'] ] 选择列 、 df.loc [ 'one' ] 按index选择行
#选择行df.loc[] 与列 df[ ] >>> df = pd.DataFrame(np.random.rand(12).reshape(3,4)*100,index=['one','two','three'],columns=['a','b','c','d'])>>> print(df) a b c done 15.715854 86.084608 22.376152 66.760504two 3.761389 63.610935 85.752549 19.065568three 77.277233 24.776938 13.159774 46.518796>>> data1 = df['a']>>> data2 = df[['a','c']]>>> print(data1,type(data1))one 15.715854two 3.761389three 77.277233Name: a, dtype: float64>>> print(data2,type(data2)) a cone 15.715854 22.376152two 3.761389 85.752549three 77.277233 13.159774 >>># 按照列名选择列,只选择一列输出Series,选择多列输出Dataframe >>> data3 = df.loc['one']>>> data4 = df.loc[['one','two']]>>> print(data3,type(data3))a 15.715854b 86.084608c 22.376152d 66.760504Name: one, dtype: float64 >>> print(data4,type(data4)) a b c done 15.715854 86.084608 22.376152 66.760504two 3.761389 63.610935 85.752549 19.065568 >>># 按照index选择行,只选择一行输出Series,选择多行输出Dataframe
2.2.1 df[ ] -- 选择列
#1. df[] - 选择列 # 一般用于选择列,也可以选择行>>> df = pd.DataFrame(np.random.rand(12).reshape(3,4)*100,index=['one','two','three'],columns=['a','b','c','d'])>>> print(df) a b c done 94.536247 33.478780 10.738060 52.679418two 37.573186 95.915130 8.529743 11.367094three 80.758763 0.000355 36.136580 95.739389>>> data1 = df['a']>>> data2 = df[['b','c']] # 尝试输入 data2 = df[['b','c','e']]会报错>>> print(data1)one 94.536247two 37.573186three 80.758763Name: a, dtype: float64>>> print(data2) b cone 33.478780 10.738060two 95.915130 8.529743three 0.000355 36.136580>>># df[]默认选择列,[]中写列名(所以一般数据colunms都会单独制定,不会用默认数字列名,以免和index冲突)# 单选列为Series,print结果为Series格式# 多选列为Dataframe,print结果为Dataframe格式>>> data3 = df[:1]#data3 = df[0] #这两种都是错误的,0 'one'#data3 = df['one']>>> print(data3,type(data3)) a b c done 94.536247 33.47878 10.73806 52.679418# df[]中为数字时,默认选择行,且只能进行切片的选择,不能单独选择(df[0])# 输出结果为Dataframe,即便只选择一行# df[]不能通过索引标签名来选择行(df['one'])# 核心笔记:df[col]一般用于选择列,[]中写列名
2.2.2 df.loc[ ] - 按index选择行
#2. df.loc[] - 按index选择行 >>> df1 = pd.DataFrame(np.random.rand(16).reshape(4,4)*100,index=['one','two','three','four'],columns=['a','b','c','d'])>>> df2 = pd.DataFrame(np.random.rand(16).reshape(4,4)*100,columns=['a','b','c','d'])>>> print(df1,'\n',df2) a b c done 36.881890 13.897714 5.237098 24.676327two 42.183000 27.146129 49.074872 56.447147three 6.935006 16.742130 5.955048 2.576066four 49.843982 64.641184 70.038643 75.103787 a b c d0 60.589246 60.305811 90.306763 46.7618241 59.296330 6.039652 52.296003 97.1499542 58.255476 13.837192 74.255506 84.0821673 55.204207 17.340171 25.056553 84.518804#单标签索引,返回Series>>> data1 = df1.loc['one'] #单标签索引返回Series >>> data2 = df2.loc[1]>>> print(data1,'\n',data2)a 36.881890b 13.897714c 5.237098d 24.676327Name: one, dtype: float64 a 59.296330b 6.039652c 52.296003d 97.149954Name: 1, dtype: float64>>>#多标签索引,顺序可变>>> data3 = df1.loc[['two','three','five']]>>> data4 = df2.loc[[3,2,1]]>>> print(data3) a b c dtwo 42.183000 27.146129 49.074872 56.447147three 6.935006 16.742130 5.955048 2.576066five NaN NaN NaN NaN #多标签索引,如果标签不存在则返回NaN >>> print(data4) a b c d3 55.204207 17.340171 25.056553 84.5188042 58.255476 13.837192 74.255506 84.0821671 59.296330 6.039652 52.296003 97.149954#切片索引 ,可以做切片对象 >>> data5 = df1.loc['one':'three'] #末端包含 >>> data6 = df2.loc[1:3]>>> print(data5) a b c done 36.881890 13.897714 5.237098 24.676327two 42.183000 27.146129 49.074872 56.447147three 6.935006 16.742130 5.955048 2.576066>>> print(data6) a b c d1 59.296330 6.039652 52.296003 97.1499542 58.255476 13.837192 74.255506 84.0821673 55.204207 17.340171 25.056553 84.518804# 核心笔记:df.loc[label]主要针对index选择行,同时支持指定index,及默认数字index
2.2.3 df.iloc[ ] - 按整数位置选择行
# df.iloc[] - 按照整数位置(从轴的0到length-1)选择行 # 类似list的索引,其顺序就是dataframe的整数位置,从0开始计>>> df = pd.DataFrame(np.random.rand(16).reshape(4,4)*100,index=['one','two','three','four'],columns=['a','b','c','d'])>>> print(df) a b c done 21.693396 38.203531 85.439983 9.740751two 28.940287 57.861274 68.467893 60.788056three 81.871777 57.813973 60.092876 1.637220four 67.789269 95.648501 62.837383 65.794259# 单位置索引; 和loc索引不同,不能索引超出数据行数的整数位置 >>> print(df.iloc[0])a 21.693396b 38.203531c 85.439983d 9.740751Name: one, dtype: float64>>> print(df.iloc[-1])a 67.789269b 95.648501c 62.837383d 65.794259Name: four, dtype: float64>>> print(df.iloc[4]) #索引超过行数了IndexError: single positional indexer is out-of-bounds# 多位置索引,顺序可变>>> print(df.iloc[[0,2]]) ##位置从0开始,这里取位置0和位置2(即第1行和第3行);列表索引只取列出的位置,不存在"末端包含"的问题 a b c done 21.693396 38.203531 85.439983 9.740751three 81.871777 57.813973 60.092876 1.637220>>> print(df.iloc[[3,2,1]]) a b c dfour 67.789269 95.648501 62.837383 65.794259three 81.871777 57.813973 60.092876 1.637220two 28.940287 57.861274 68.467893 60.788056#切片索引 >>> print(df.iloc[1:3]) #末端不包含 a b c dtwo 28.940287 57.861274 68.467893 60.788056three 81.871777 57.813973 60.092876 1.637220>>> print(df.iloc[::2]) a b c done 21.693396 38.203531 85.439983 9.740751three 81.871777 57.813973 60.092876 1.637220>>>
2.2.4 布尔型索引
df < 20 df [ df < 20 ] 、 单列做判断df [ 'a' ] >20 、多列做判断df [ ['a', 'b'] ] >20 、多行做判断 df.loc[ ['one', 'three'] ] < 50
#布尔型索引 # 和Series原理相同>>> df = pd.DataFrame(np.random.rand(16).reshape(4,4)*100,index=['one','two','three','four'],columns=['a','b','c','d'])>>> print(df) a b c done 17.575951 66.534852 96.774872 94.415801two 67.485820 11.871447 19.140092 9.634462three 32.052532 8.891445 63.209949 92.451412four 6.931403 0.622515 29.972335 24.438536>>> b1 = df < 20 # 也可以书写为 df[df < 20]>>> print(b1,type(b1)) a b c done True False False Falsetwo False True True Truethree False True False Falsefour True True False False>>> print(df[b1]) a b c done 17.575951 NaN NaN NaNtwo NaN 11.871447 19.140092 9.634462three NaN 8.891445 NaN NaNfour 6.931403 0.622515 NaN NaN>>># 不做索引则会对数据每个值进行判断# 索引结果保留 所有数据:True返回原数据,False返回值为NaN >>> b2 = df['a'] > 50>>> print(b2,type(b2))one Falsetwo Truethree Falsefour FalseName: a, dtype: bool >>> print(df[b2]) #会把two为True的行保留,包括小于50的数 a b c dtwo 67.48582 11.871447 19.140092 9.634462# 单列做判断,索引结果保留单列判断为True的行数据,包括其他列 >>> b3 = df[['a','b']] > 50>>> print(b3,type(b3)) a bone False Truetwo True Falsethree False Falsefour False False >>> print(df[b3]) a b c done NaN 66.534852 NaN NaNtwo 67.48582 NaN NaN NaNthree NaN NaN NaN NaNfour NaN NaN NaN NaN# 多列做判断,索引结果保留所有数据:True返回原数据,False返回值为NaN >>>>>> b4 = df.loc[['one','three']] < 50>>> print(b4,type(b4)) a b c done True False False Falsethree True True False False >>> print(df[b4]) a b c done 17.575951 NaN NaN NaNtwo NaN NaN NaN NaNthree 32.052532 8.891445 NaN NaNfour NaN NaN NaN NaN>>># 多行做判断,索引结果保留 所有数据:True返回原数据,False返回值为NaN
2.2.5 多重索引:比如同时索引行和列
先选择列再选择行:df['a'].loc[['one', 'three']] 、 先按条件筛选行再取前两行:df[df['a'] < 50].iloc[:2]
#多重索引:比如同时索引行和列# 先选择列再选择行 —— 相当于对于一个数据,先筛选字段,再选择数据量>>> df = pd.DataFrame(np.random.rand(16).reshape(4,4)*100,index=['one','two','three','four'],columns=['a','b','c','d'])>>> print(df) a b c done 12.408141 98.202562 38.715980 62.978631two 93.980397 39.455335 77.214844 42.495949three 4.210569 48.999179 10.320513 51.919796four 73.838276 72.854442 98.555301 27.902682>>> print(df['a'].loc[['one','three']]) # 选择a列的one,three行one 12.408141three 4.210569Name: a, dtype: float64>>> print(df[['b','c','d']].iloc[::2]) # 选择b,c,d列的one,three行 b c done 98.202562 38.715980 62.978631three 48.999179 10.320513 51.919796>>> print(df[df['a'] < 50].iloc[:2]) # 选择满足判断索引的前两行数据 a b c done 12.408141 98.202562 38.715980 62.978631three 4.210569 48.999179 10.320513 51.919796>>>
2.3 Dataframe:基本技巧
数据查看、转置 / 添加、修改、删除值 / 对齐 / 排序
######数据查看(.head() .tail() )与转置( .T ) >>> df = pd.DataFrame(np.random.rand(16).reshape(8,2)*100,columns=['a','b'])>>> print(df) a b0 41.447858 93.9378781 29.684415 58.6379932 2.260561 23.6013273 79.555013 55.6110104 64.825361 92.4447695 53.716091 40.1668726 19.657354 47.8424877 22.705715 26.977886>>>>>> print(df.head(2)) a b0 41.447858 93.9378781 29.684415 58.637993>>> print(df.tail()) a b3 79.555013 55.6110104 64.825361 92.4447695 53.716091 40.1668726 19.657354 47.8424877 22.705715 26.977886
# .head()查看头部数据
# .tail()查看尾部数据 # 默认查看5条>>> print(df.T) 0 1 2 3 4 5 6 7a 41.447858 29.684415 2.260561 79.555013 64.825361 53.716091 19.657354 22.705715b 93.937878 58.637993 23.601327 55.611010 92.444769 40.166872 47.842487 26.977886 # .T 转置
# 添加与修改 >>> df = pd.DataFrame(np.random.rand(16).reshape(4,4)*100,columns=['a','b','c','d'])>>> print(df) a b c d0 40.395591 38.023720 64.954712 82.8336011 69.405393 77.664903 76.566145 11.2187532 61.793220 95.929196 15.415231 79.3686913 29.482119 85.228170 94.134330 25.678733>>> df['e'] = 10>>> df.loc[4] = 20>>> print(df) a b c d e0 40.395591 38.023720 64.954712 82.833601 101 69.405393 77.664903 76.566145 11.218753 102 61.793220 95.929196 15.415231 79.368691 103 29.482119 85.228170 94.134330 25.678733 104 20.000000 20.000000 20.000000 20.000000 20>>># 新增列/行并赋值 >>> df['e'] = 20>>> df[['a','c']] = 100>>> print(df) a b c d e0 100 38.023720 100 82.833601 201 100 77.664903 100 11.218753 202 100 95.929196 100 79.368691 203 100 85.228170 100 25.678733 204 100 20.000000 100 20.000000 20>>># 索引后直接修改值
# 删除 del / drop() ;inplace = False/True 、 axis = 0 为行 | axis = 1 为列 >>> df = pd.DataFrame(np.random.rand(16).reshape(4,4)*100,columns=['a','b','c','d'])>>> print(df) a b c d0 76.082974 91.636219 70.831268 82.9004431 16.328769 9.910538 36.670726 67.1874922 96.234567 16.699254 0.257354 31.0322393 16.659137 85.438085 91.993957 33.055454>>> del df['a']>>> print(df) b c d0 91.636219 70.831268 82.9004431 9.910538 36.670726 67.1874922 16.699254 0.257354 31.0322393 85.438085 91.993957 33.055454>>># del语句 - 删除列 >>> print(df.drop(0)) ##删除行 b c d1 9.910538 36.670726 67.1874922 16.699254 0.257354 31.0322393 85.438085 91.993957 33.055454>>> print(df.drop([1,2])) b c d0 91.636219 70.831268 82.9004433 85.438085 91.993957 33.055454>>> print(df) ##原数据不改变 b c d0 91.636219 70.831268 82.9004431 9.910538 36.670726 67.1874922 16.699254 0.257354 31.0322393 85.438085 91.993957 33.055454>>># drop()删除行,inplace=False → 删除后生成新的数据,不改变原数据
>>> print(df1.drop(['1'], inplace=True)) # 默认 inplace=False;inplace=True 时直接在原数据上删除,方法本身返回 None
None>>> print(df.drop(['d'],axis = 1)) #axis = 1是删除列 ---> 用 [ ],不改变原数据 ; axis = 0是删除行,不改变原数据。 b c0 91.636219 70.8312681 9.910538 36.6707262 16.699254 0.2573543 85.438085 91.993957>>> print(df) b c d0 91.636219 70.831268 82.9004431 9.910538 36.670726 67.1874922 16.699254 0.257354 31.0322393 85.438085 91.993957 33.055454>>># drop()删除列,需要加上axis = 1,inplace=False → 删除后生成新的数据,不改变原数据
#对齐 + >>> df1 = pd.DataFrame(np.random.randn(10,4),columns=['A','B','C','D'])>>> df2 = pd.DataFrame(np.random.randn(7,3),columns=['A','B','C'])>>> print(df1) A B C D0 -0.711905 1.102947 -0.203125 0.4641601 -1.633976 -0.126530 1.437948 1.7210492 1.323383 -0.277546 0.060134 0.2070933 1.708294 0.815721 -0.151322 0.5229374 0.263572 -0.674251 -1.325148 -2.7024645 1.659823 -0.131172 -1.114735 -2.1825276 -0.186723 -0.071455 -1.370213 0.5130627 0.381603 1.265310 0.083247 1.0840618 0.399770 0.765438 -1.066299 0.6264029 0.781321 -1.612135 -0.387417 -0.673143>>> print(df2) A B C0 0.012025 -0.488556 0.2435151 -0.751000 0.277448 0.0136752 1.008712 -1.231084 -0.5233293 0.663029 -0.752602 -0.7247494 -0.755075 0.303930 1.2883355 -1.233975 -1.241185 -0.4145646 -0.251519 -1.384259 -0.996120>>> print(df1+df2) #DataFrame对象之间的数据自动按照列和索引(行标签)对齐。 A B C D0 -0.699879 0.614391 0.040390 NaN1 -2.384977 0.150917 1.451622 NaN2 2.332095 -1.508629 -0.463195 NaN3 2.371323 0.063119 -0.876071 NaN4 -0.491503 -0.370321 -0.036813 NaN5 0.425847 -1.372357 -1.529299 NaN6 -0.438242 -1.455714 -2.366333 NaN7 NaN NaN NaN NaN8 NaN NaN NaN NaN9 NaN NaN NaN NaN>>>
# 排序1 - 按值排序 .sort_values # 同样适用于Series>>> df1 = pd.DataFrame(np.random.rand(16).reshape(4,4)*100,columns=['a','b','c','d'])>>> print(df) b c d0 91.636219 70.831268 82.9004431 9.910538 36.670726 67.1874922 16.699254 0.257354 31.0322393 85.438085 91.993957 33.055454>>> print(df1.sort_values(['a'],ascending=True)) #升序; ascending参数:设置升序降序,默认升序。 a b c d0 3.255012 35.188882 99.290551 67.8975801 43.221583 36.144081 84.124544 18.8449673 47.364524 41.530226 20.800088 22.5971982 83.170528 1.550416 7.810286 61.375057>>> print(df1.sort_values(['a'],ascending=False)) #降序 a b c d2 83.170528 1.550416 7.810286 61.3750573 47.364524 41.530226 20.800088 22.5971981 43.221583 36.144081 84.124544 18.8449670 3.255012 35.188882 99.290551 67.897580# 单列排序 >>> df2 = pd.DataFrame({ 'a':[1,1,1,1,2,2,2,2],'b':list(range(8)),'c':list(range(8,0,-1))})>>> print(df2) a b c0 1 0 81 1 1 72 1 2 63 1 3 54 2 4 45 2 5 36 2 6 27 2 7 1>>> print(df2.sort_values(['a','c'])) #多列排序,按列顺序排序。 a b c 3 1 3 52 1 2 61 1 1 70 1 0 87 2 7 16 2 6 25 2 5 34 2 4 4>>>
# 排序2 - 索引排序 .sort_index >>> df1 = pd.DataFrame(np.random.rand(16).reshape(4,4)*100,index=[5,4,3,2],columns=['a','b','c','d'])>>> df2 = pd.DataFrame(np.random.rand(16).reshape(4,4)*100,index=['h','s','x','g'],columns=['a','b','c','d'])>>> print(df1) a b c d5 70.899006 29.653652 38.273239 99.2549314 68.173016 27.051275 43.236560 48.5730183 35.870577 41.990773 78.055733 63.5813522 20.946046 19.712039 33.906534 89.749668>>> print(df1.sort_index()) a b c d2 20.946046 19.712039 33.906534 89.7496683 35.870577 41.990773 78.055733 63.5813524 68.173016 27.051275 43.236560 48.5730185 70.899006 29.653652 38.273239 99.254931>>> print(df2) a b c dh 62.234181 32.481881 83.483145 39.145470s 41.003081 16.515826 19.958257 30.331726x 60.486728 20.206607 91.149820 31.731089g 22.132468 61.116998 19.929379 98.976248>>> print(df2.sort_index()) a b c dg 22.132468 61.116998 19.929379 98.976248h 62.234181 32.481881 83.483145 39.145470s 41.003081 16.515826 19.958257 30.331726x 60.486728 20.206607 91.149820 31.731089>>># 按照index排序# 默认 ascending=True, inplace=False