python之pandas模块_in <module> import pandas as pd-CSDN博客

本文深入讲解了Pandas库的高级应用，包括Series与DataFrame的创建、操作与运算，以及日期处理、数据读写、分组聚合等功能，适用于希望提升数据分析技能的读者。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

首先导入所需模块pandas

import pandas as pd
import numpy as np
import  string

Series数据类型

1.pandas创建Series数据类型
1).通过列表创建Series对象
不指定索引：

array = ["粉条", "粉丝", "粉带"]
s1 = pd.Series(data=array)  # 如果不指定索引，默认从0开始
print(s1)
# 0    粉条
# 1    粉丝
# 2    粉带
# dtype: object

指定索引：

array = ["粉条", "粉丝", "粉带"]
s1 = pd.Series(data=array,index=['A','B','C'])  # 指定索引
print(s1)
# A    粉条
# B    粉丝
# C    粉带
# dtype: object

2).通过numpy的对象Ndarray创建Series

n = np.random.randn(5)
s2 = pd.Series(data=n)
print(s2)
#
0    1.603141
1    1.451584
2    1.011957
3    1.234106
4   -0.728547
dtype: float64

3).通过字典创建Series对象

# Build a Series from a dictionary: the keys become the index labels and the
# values become the data. The mapping is deliberately not named `dict`, which
# would shadow the builtin type for the rest of the script.
letter_to_position = {letter: position for position, letter in enumerate(string.ascii_lowercase[:10])}
s3 = pd.Series(letter_to_position)
print(s3)
#
a    0
b    1
c    2
d    3
e    4
f    5
g    6
h    7
i    8
j    9
dtype: int64

2.Series的基本操作
1).修改Series索引

array = ["粉条", "粉丝", "粉带"]
s1 = pd.Series(data=array)
print(s1)
# 
0    粉条
1    粉丝
2    粉带
dtype: object

print(s1.index)
# RangeIndex(start=0, stop=3, step=1)

s1.index = ['A', 'B', 'C'] # 修改索引为A，B，C
print(s1)
# 
A    粉条
B    粉丝
C    粉带
dtype: object

2). Series纵向拼接

# Concatenate two Series vertically. The original index labels of both inputs
# are kept, so the result carries A, B, C followed by 0, 1, 2.
array = ["粉条", "粉丝", "粉带"]
s1 = pd.Series(data=array, index=['A', 'B', 'C'])
array1 = ["粉条", "粉丝", "粉带"]
s2 = pd.Series(data=array1)
# Series.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported way to stack Series.
s3 = pd.concat([s1, s2])
print(s3)
#
A    粉条
B    粉丝
C    粉带
0    粉条
1    粉丝
2    粉带
dtype: object

3). 删除指定索引对应的元素

array = ["粉条", "粉丝", "粉带"]
s1 = pd.Series(data=array,index=['A','B','C'])  # 指定索引
print(s1)
s4 = s1.drop('C')  # 删除索引为‘C’对应的值;
print(s4)
#
A    粉条
B    粉丝
dtype: object

4). 根据指定的索引查找元素

array = ["粉条", "粉丝", "粉带"]
s3 = pd.Series(data=array,index=['A','B','C'])  # 指定索引
print(s3['B'])
# 粉丝

s3['B'] = np.nan
print(s3)
# 
A     粉条
B    NaN
C    粉带
dtype: object

5). 切片操作

array = ["粉条", "粉丝", "粉带"]
s3 = pd.Series(data=array,index=['A','B','C'])  # 指定索引

print(s3[:2]) # 显示前两个元素
# 
A    粉条
B    粉丝
dtype: object

print(s3[::-1]) # 反转
#
C    粉带
B    粉丝
A    粉条
dtype: object

print(s3[-2:])  # 显示最后两个元素
#
B    粉丝
C    粉带
dtype: object

3.Series运算

import pandas as pd
import numpy as np
import  string

s1  = pd.Series(np.arange(5), index=list(string.ascii_lowercase[:5]))
s2  = pd.Series(np.arange(2, 8), index=list(string.ascii_lowercase[2:8]))

print(s1)
# 
a    0
b    1
c    2
d    3
e    4
dtype: int64

print(s2)
# 
c    2
d    3
e    4
f    5
g    6
h    7
dtype: int64

1).加法
相同索引的数值相加，没有相同索引的值为缺失值

print(s1 + s2)
# print(s1.add(s2)) # 等同于‘+’
# 
a    NaN
b    NaN
c    4.0
d    6.0
e    8.0
f    NaN
g    NaN
h    NaN
dtype: float64

2).减法

print(s1 - s2)
# 等同于 print(s1.sub(s2))
# 
a    NaN
b    NaN
c    0.0
d    0.0
e    0.0
f    NaN
g    NaN
h    NaN
dtype: float64

3).乘法

print(s1 * s2)
# print(s1.mul(s2))
# 
a     NaN
b     NaN
c     4.0
d     9.0
e    16.0
f     NaN
g     NaN
h     NaN
dtype: float64

4).除法

print(s1 / s2)
# print(s1.div(s2))
#
a    NaN
b    NaN
c    1.0
d    1.0
e    1.0
f    NaN
g    NaN
h    NaN
dtype: float64

5).求中位数

print(s1.median())
# 2.0

6).求和

print(s1.sum())
# 10

7).最大值

print(s1.max())
# 4

8).最小值

print(s1.min())
# 0

9).特殊的where方法
Series中的where方法与numpy中的where行为不同：Series.where会保留满足条件的元素，并把不满足条件的元素替换为缺失值（或指定的值）；mask方法则正好相反。

s1 = pd.Series(np.arange(5), index=list(string.ascii_lowercase[:5]))
print(s1)
# 
a    0
b    1
c    2
d    3
e    4
dtype: int64

# 对象中不大于3的元素赋值为缺失值
print(s1.where(s1 > 3))
# 
a    NaN
b    NaN
c    NaN
d    NaN
e    4.0
dtype: float64

# 对象中不大于3的元素赋值为10；
print(s1.where(s1 > 3, 10))
# 
a    10
b    10
c    10
d    10
e     4
dtype: int64

# 对象中大于3的元素赋值为10；
print(s1.mask(s1 > 3, 10))
#
a     0
b     1
c     2
d     3
e    10
dtype: int64

DataFrame数据类型

1.pandas创建DataFrame数据类型
1).通过列表创建

li = [
    [1, 2, 3, 4],
    [2, 3, 4, 5]
]

# DataFrame对象里面包含两个索引， 行索引(0轴， axis=0)， 列索引(1轴， axis=1)
d1 = pd.DataFrame(data=li, index=['A', 'B'], columns=['views', 'loves', 'comments', 'tranfers'])
print(d1)
# 
   views  loves  comments  tranfers
A      1      2         3         4
B      2      3         4         5

2).通过numpy对象创建

narr = np.arange(8).reshape(2, 4)
# DataFrame对象里面包含两个索引， 行索引(0轴， axis=0)， 列索引(1轴， axis=1)
d2 = pd.DataFrame(data=narr, index=['A', 'B'], columns=['views', 'loves', 'comments', 'tranfers'])
print(d2)
# 
   views  loves  comments  tranfers
A      0      1         2         3
B      4      5         6         7

3).通过字典的方式创建

# Build a DataFrame from a dictionary: each key becomes a column name and each
# list supplies that column's values. The mapping is deliberately not named
# `dict`, which would shadow the builtin type for the rest of the script.
column_data = {
    'views': [1, 2],
    'loves': [2, 3],
    'comments': [3, 4],
}
# `index` supplies the row labels; its length must match the column lists.
d3 = pd.DataFrame(data=column_data, index=['粉条', "粉丝"])
print(d3)
#
    views  loves  comments
粉条      1      2         3
粉丝      2      3         4

有关日期的操作pd.date_range()

dates = pd.date_range(start='1/1/2018', end='1/08/2018')
print(dates)
# 默认按天数来分隔
DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04',
               '2018-01-05', '2018-01-06', '2018-01-07', '2018-01-08'],
              dtype='datetime64[ns]', freq='D')

例1：

# 行索引
dates = pd.date_range(start='today', periods=6)
# 数据
data_arr = np.random.randn(6, 4)
# 列索引
columns = ['A', 'B', 'C', 'D']
d4 = pd.DataFrame(data_arr, index=dates, columns=columns)
print(d4)
# 
                                   A         B         C         D
2019-02-27 14:57:47.555758  0.120459  0.020717 -0.231611 -0.467826
2019-02-28 14:57:47.555758 -0.533083  0.900358 -1.166000 -1.356933
2019-03-01 14:57:47.555758  0.429399  1.589066  0.062517 -1.709545
2019-03-02 14:57:47.555758 -2.182392  1.434439  0.106964  0.872465
2019-03-03 14:57:47.555758  0.111128 -1.365457  0.185828 -0.731432
2019-03-04 14:57:47.555758 -1.450903 -0.054512  0.112919 -0.210644

例2：一维对象: 建立一个以2019年每一天作为索引，值为随机数

dates = pd.date_range(start='1/1/2019', end='12/31/2019', freq='D')
datas = np.random.randn(len(dates))
s1 = pd.Series(datas, index=dates)
print(s1[:3])  # 这里只显示前三行
# 
2019-01-01    0.311031
2019-01-02    0.279827
2019-01-03    0.392495
Freq: D, dtype: float64

2.DataFrame的基本操作

narr = np.arange(8).reshape(2, 4)
# DataFrame对象里面包含两个索引， 行索引(0轴， axis=0)， 列索引(1轴， axis=1)
d2 = pd.DataFrame(data=narr, index=['A', 'B'], columns=['views', 'loves', 'comments', 'tranfers'])
print(d2)
# 
   views  loves  comments  tranfers
A      0      1         2         3
B      4      5         6         7

1).查看基础属性

print(d2.shape)  # 获取行数和列数
# (2, 4)

print(d2.dtypes)  # 列数据类型
# 
views       int64
loves       int64
comments    int64
tranfers    int64
dtype: object

print(d2.ndim)  # 获取数据的维度
# 2

print(d2.index) # 行索引
# Index(['A', 'B'], dtype='object')

print(d2.columns) # 列索引
# Index(['views', 'loves', 'comments', 'tranfers'], dtype='object')

print(d2.values, type(d2.values))   # 对象的值， 二维ndarray数组;
#
 [[0 1 2 3]
 [4 5 6 7]] <class 'numpy.ndarray'>

2). 数据整体状况的查询
显示头部或尾部的几行

print(d2.head(1))  # 显示头部的几行， 默认5行
# 
   views  loves  comments  tranfers
A      0      1         2         3

print(d2.tail(1))  # 显示尾部的几行， 默认5行
# 
   views  loves  comments  tranfers
B      4      5         6         7

相关信息的预览：行数，列数，列类型，内存占用

# Preview summary information: row count, column count, column dtypes and
# memory usage.
# DataFrame.info() prints the report itself and returns None, so it is called
# directly; wrapping it in print() would only append a spurious "info: None".
d2.info()
# <class 'pandas.core.frame.DataFrame'>
# Index: 2 entries, A to B
# Data columns (total 4 columns):
# views       2 non-null int64
# loves       2 non-null int64
# comments    2 non-null int64
# tranfers    2 non-null int64
# dtypes: int64(4)
# memory usage: 80.0+ bytes

快速综合统计结果：计数，均值，标准差，最小值， 1/4位数，中位数， 3/4位数，最大值

print(d2.describe())
# 
          views     loves  comments  tranfers
count  2.000000  2.000000  2.000000  2.000000
mean   2.000000  3.000000  4.000000  5.000000
std    2.828427  2.828427  2.828427  2.828427
min    0.000000  1.000000  2.000000  3.000000
25%    1.000000  2.000000  3.000000  4.000000
50%    2.000000  3.000000  4.000000  5.000000
75%    3.000000  4.000000  5.000000  6.000000
max    4.000000  5.000000  6.000000  7.000000

转置操作

print(d2.T)
# 
          A  B
views     0  4
loves     1  5
comments  2  6
tranfers  3  7

按列进行排序

print(d2)
# 
   views  loves  comments  tranfers
A      0      1         2         3
B      4      5         6         7

# 按照指定列进行排序， 默认是升序， 如果需要降序显示，设置ascending=False;
print(d2.sort_values(by="views", ascending=False))
# 
   views  loves  comments  tranfers
B      4      5         6         7
A      0      1         2         3

切片及查询

print(d2[:1])   # 可以实现切片， 但是不能索引;
# 
   views  loves  comments  tranfers
A      0      1         2         3

print('1:\n', d2['views'])   # 通过标签查询， 获取单列信息
# 1:
 A    0
B    4
Name: views, dtype: int64

print('2:\n', d2.views)   # 和上面是等价的;
# 2:
 A    0
B    4
Name: views, dtype: int64

print(d2[['views', 'comments']])  # 通过标签查询多列信息
# 
   views  comments
A      0         2
B      4         6

通过类似索引的方式查询
iloc(通过位置进行行数据的获取)

print(d2.iloc[0])
# 
views       0
loves       1
comments    2
tranfers    3
Name: A, dtype: int64

print(d2.iloc[-1:])
# 
   views  loves  comments  tranfers
B      4      5         6         7

loc(通过标签索引行数据)

print(d2.loc['A'])
# 
views       0
loves       1
comments    2
tranfers    3
Name: A, dtype: int64

更改pandas的值

d2.loc['A'] = np.nan
print(d2)
# 
   views  loves  comments  tranfers
A    NaN    NaN       NaN       NaN
B    4.0    5.0       6.0       7.0

从文件中读取，保存数据

from pathlib import Path

import pandas as pd

# Sample data used to demonstrate writing to and reading from files.
df = pd.DataFrame(
    {'province': ['陕西', '陕西', '四川', '四川', '陕西'],
     'city': ['咸阳', '宝鸡', '成都', '成都', '宝鸡'],
     'count1': [1, 2, 3, 4, 5],
     'count2': [1, 2, 33, 4, 5]
     }
)

# to_csv does not create missing directories, so make sure the target exists.
Path('doc').mkdir(parents=True, exist_ok=True)

# Write the CSV file. index=False leaves the row index out of the file;
# otherwise reading it back produces an extra "Unnamed: 0" column.
df.to_csv('doc/csvFile.csv', index=False)

# Read the CSV file back into a new DataFrame.
df2 = pd.read_csv('doc/csvFile.csv')

# Write an Excel file (needs an Excel writer engine such as openpyxl installed).
df2.to_excel("/tmp/excelFile.xlsx", sheet_name="省份统计")

分组与聚合操作之groupby

pandas提供了一个灵活高效的groupby功能，
1). 它使你能以一种自然的方式对数据集进行切片、切块、摘要等操作。
2). 根据一个或多个键（可以是函数、数组或DataFrame列名）拆分pandas对象。
3). 计算分组摘要统计，如计数、平均值、标准差，或用户自定义函数。
例：

import pandas as pd
df = pd.DataFrame(
    {'province': ['陕西', '陕西', '四川', '四川', '陕西'],
     'city': ['咸阳', '宝鸡', '成都', '成都', '宝鸡'],
     'count1': [1, 2, 3, 4, 5],
     'count2': [1, 2, 33, 4, 5]
     }
)

1.根据省份进行统计分析

grouped = df['count1'].groupby(df['province'])
print(grouped.describe())
# 
          count      mean       std  min   25%  50%   75%  max
province                                                      
四川          2.0  3.500000  0.707107  3.0  3.25  3.5  3.75  4.0
陕西          3.0  2.666667  2.081666  1.0  1.50  2.0  3.50  5.0

2.根据城市统计分析count1的信息

grouped = df['count1'].groupby(df['city'])
print(grouped.max())
# 
city
咸阳    1
宝鸡    5
成都    4
Name: count1, dtype: int64

3.指定多个key值进行分类聚合

grouped = df['count1'].groupby([df['province'], df['city']])

print(grouped.max())
# 
province  city
四川        成都      4
陕西        咸阳      1
          宝鸡      5
Name: count1, dtype: int64
           
print(grouped.sum())
# 
province  city
四川        成都      7
陕西        咸阳      1
          宝鸡      7
Name: count1, dtype: int64

print(grouped.count())
# 
province  city
四川        成都      2
陕西        咸阳      1
          宝鸡      2
Name: count1, dtype: int64

4.通过unstack方法，实现层次化的索引

print(grouped.max().unstack())
# 
city       咸阳   宝鸡   成都
province               
四川        NaN  NaN  4.0
陕西        1.0  5.0  NaN