@1007477689
2020-05-19T10:28:58.000000Z
字数 13351
阅读 973
Pandas学习练习
Python
Pandas
import pandas as pd

# Show the installed pandas version.
pd.__version__

# Build a Series from a plain Python list.
arry = [0, 1, 2, 3, 4]
df = pd.Series(arry)
df
# When no index is given, a default integer index starting at 0 is used.
>>> 0 0
1 1
2 2
3 3
4 4
dtype: int64
# A dict maps directly onto a Series: keys become the index, values the data.
data = {'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5}
df = pd.Series(data)
df
>>> a 1
b 2
c 3
d 4
e 5
dtype: int64
import numpy as np

# Six consecutive days starting today serve as the row index.
dates = pd.date_range(start='today', periods=6)
# A 6x4 array of standard-normal samples.
num_arr = np.random.randn(6, 4)
columns = ['A', 'B', 'C', 'D']
df = pd.DataFrame(num_arr, index=dates, columns=columns)
df
# pd.date_range('today', periods=6) builds the datetime index,
# np.random.randn(6, 4) supplies the random data,
# and the list becomes the column labels.
# Read a ';'-separated, GBK-encoded CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer = 'test.csv', encoding = 'gbk', sep = ';')
df
# NOTE(review): this writes back to 'test.csv', overwriting the file that was
# just read — and with the default ',' separator and UTF-8 encoding rather
# than the ';'/GBK format it was read with. A distinct output name is safer.
df.to_csv('test.csv')
# df.to_csv('output.csv') would save df as a CSV file named output.csv.
import numpy as np

# Sample data for the classic "animals" exercises: four columns, ten rows
# labelled 'a' through 'j'; two ages are deliberately missing (NaN).
data = {
    'animal': ['cat', 'cat', 'snake', 'dog', 'dog', 'cat', 'snake', 'cat', 'dog', 'dog'],
    'age': [2.5, 3, 0.5, np.nan, 5, 2, 4.5, np.nan, 7, 3],
    'visits': [1, 3, 2, 3, 2, 3, 1, 1, 2, 1],
    'priority': ['yes', 'yes', 'no', 'yes', 'no', 'no', 'no', 'yes', 'no', 'no'],
}
index_labels = list('abcdefghij')
df = pd.DataFrame(data, index=index_labels)
df
# Inspect the frame: two equivalent "overview" calls.
# Method 1: dtypes, non-null counts, memory usage
df.info()
# Method 2: descriptive statistics of the numeric columns
df.describe()
# First three rows.
# Method 1: by integer position
df.iloc[:3]
# Method 2: via head()
df.head(n = 3)
# Select only the 'animal' and 'age' columns.
# Method 1: label-based
df.loc[:, ['animal', 'age']]
# Method 2: plain column-list indexing
df[['animal', 'age']]
# Note the inner [] — df.index[[3, 4, 8]] picks rows 4, 5 and 9 by position.
df.loc[df.index[[3, 4, 8]], ['animal', 'age']]
# Boolean mask: rows whose age is strictly greater than 3.
df[df['age'] > 3]
# Boolean Series marking the rows where age is missing.
df['age'].isnull()
>>> a False
b False
c False
d True
...
g False
h True
i False
j False
Name: age, dtype: bool
# Rows with a missing age.
df[df['age'].isnull()]
# Ages between 2 and 4.
# Method 1: combined boolean masks (strict inequalities)
df[(df['age'] > 2) & (df['age'] < 4)]
# Method 2: between() — NOTE(review): between() includes both endpoints,
# so the two methods differ for ages exactly equal to 2 or 4.
df[df['age'].between(2, 4)]
# Set the age of row 'f' to 1.5.
df.loc['f', 'age'] = 1.5
df
# Total number of visits across all rows.
df['visits'].sum()
# Mean age per animal type.
df.groupby('animal')['age'].mean()
# Insert a new row 'k'. The list must follow the column order
# (animal, age, visits, priority); the original [5.5, 'dog', 'no', 2]
# put the age into 'animal' and the animal name into 'age'.
df.loc['k'] = ['dog', 5.5, 2, 'no']
# Delete the row again.
df = df.drop('k')
# Count how many rows there are of each animal type.
df['animal'].value_counts()
# .value_counts() returns the frequency of each distinct value.
# Sort by age descending, then by visits ascending.
df.sort_values(by = ['age', 'visits'], ascending = [False, True])
# False = descending, True = ascending
# Map the yes/no strings onto real booleans.
df['priority'] = df['priority'].map({'yes': True, 'no': False})
df
# Rename every 'snake' to 'python' in the animal column.
df['animal'] = df['animal'].replace(to_replace = 'snake', value = 'python')
df
df.age.dtypes
df.age = df.age.astype(dtype = 'float')
# Cast age to float explicitly.
# Without the cast, df.pivot_table(index = 'animal', columns = 'visits', values = 'age', aggfunc = 'mean') would fail.
df.age.dtypes
>>> dtype('float64')
# Pivot: mean age per (animal, visits) combination.
df.pivot_table(index = 'animal', columns = 'visits', values = 'age', aggfunc = 'mean')
# Remove duplicate values from column A.
data = {'A': [1, 2, 2, 3, 2, 5, 5, 5, 6, 7, 7]}
df = pd.DataFrame(data)

# Method 1: keep a row only when its value differs from the previous row.
# This drops *consecutive* duplicates only; repeats separated by other
# values survive.
df1 = df.loc[df['A'] != df['A'].shift()]
print(df1)

# Method 2: drop_duplicates() removes every repeated value in column A,
# keeping the first occurrence, and returns a new DataFrame.
df1 = df.drop_duplicates(subset = 'A')

# Subtract each row's mean from that row.
df = pd.DataFrame(np.random.random(size = (5, 3)))
df_sub = df.sub(df.mean(axis = 1), axis = 0)
# df.mean(axis = 1) is the per-row mean; axis = 0 in sub() broadcasts it
# down the columns, so every row of df_sub sums to (approximately) zero.
# list() splits a string into its characters:
list('abcde')
>>> ['a', 'b', 'c', 'd', 'e']
df = pd.DataFrame(data = np.random.random(size = (5, 5)), columns = list('abcde'))
print(df)
# df.sum() gives the per-column totals; idxmin() returns the label of the
# column whose total is smallest.
df.sum().idxmin()
# Sum the three largest values of B within each group of A.
df = pd.DataFrame({'A': list('aaabbcaabcccbbc'),
'B': [12, 345, 3, 1, 45, 14, 4, 52, 54, 23, 235, 21, 57, 3, 87]})
# nlargest(3) keeps the top three B values per group (result has a MultiIndex).
df1 = df.groupby(by = 'A')['B'].nlargest(n = 3)
>>> A
a 1 345
7 52
0 12
b 12 57
8 54
4 45
c 10 235
14 87
9 23
Name: B, dtype: int64
df1 = df.groupby(by = 'A')['B'].nlargest(n = 3).sum(level = 0)
# level: when the axis is a MultiIndex (hierarchical), sum along the given
# level and collapse the result into a Series keyed by that level.
# NOTE(review): Series.sum(level = ...) is deprecated in pandas >= 1.3;
# the modern spelling is .groupby(level = 0).sum().
>>> A
a 409 (=345 + 52 + 12)
b 156 (= 57 + 54 + 45)
c 345 (= 235 + 87 + 23)
Name: B, dtype: int64
# Bin the values of A into ten intervals of width 10 and sum B per bin.
data = {'A': [1, 2, 11, 11, 33, 34, 35, 40, 79, 99],
'B': [1, 2, 11, 11, 33, 34, 35, 40, 79, 99]}
df = pd.DataFrame(data = data)
# pd.cut labels each A value with the interval it falls into.
groupby_rule = pd.cut(x = df['A'], bins = np.arange(start = 0, stop = 101, step = 10))
groupby_rule
>>> 0 (0, 10]
1 (0, 10]
2 (10, 20]
3 (10, 20]
4 (30, 40]
5 (30, 40]
6 (30, 40]
7 (30, 40]
8 (70, 80]
9 (90, 100]
Name: A, dtype: category
Categories (10, interval[int64]): [(0, 10] < (10, 20] < (20, 30] < (30, 40] ... (60, 70] < (70, 80] < (80, 90] < (90, 100]]
# Group B by the interval of A and sum within each interval.
df1 = df.groupby(by = groupby_rule)['B'].sum()
df1
# pandas.cut(x, bins, right = True, labels = None, retbins = False, precision = 3, include_lowest = False, duplicates = 'raise')
# x: the array-like input to bin; it must be one-dimensional (a DataFrame is not accepted)
# bins: the bin edges; three accepted forms: an int scalar, a sequence of
# scalars, or a pandas.IntervalIndex
>>> A
(0, 10] 3
(10, 20] 22
(20, 30] 0
(30, 40] 142
(40, 50] 0
(50, 60] 0
(60, 70] 0
(70, 80] 79
(80, 90] 0
(90, 100] 99
Name: B, dtype: int64
# For each row, Y = number of rows since the most recent zero in X
# (rows before the first zero count from an imaginary zero at position -1).
data = {'X': [7, 2, 0, 3, 4, 2, 5, 0, 3, 4]}
df = pd.DataFrame(data = data)
# Method 1 (vectorised with numpy):
# izero holds the positions of the zeros, with a sentinel -1 prepended.
izero = np.r_[-1, (df['X'] == 0).to_numpy().nonzero()[0]]
idx = np.arange(len(df))
# For each position, locate the most recent zero at or before it and take
# the distance to it.
df['Y'] = idx - izero[np.searchsorted(izero - 1, idx) - 1]
print(df)
# Method 1 only needs df[column].shift(periods = 1, axis = 0) semantics
# internally; it handles non-consecutive zeros correctly.
# Method 2 (pure pandas, cumulative sums):
x = (df['X'] != 0).cumsum()
y = x != x.shift()
df['Y'] = y.groupby((y != y.shift()).cumsum()).cumsum()
# Method 3 (groupby / cumcount): each zero starts a new group and cumcount
# numbers the rows inside it, so the zero row itself gets 0.
df['Y'] = df.groupby((df['X'] == 0).cumsum()).cumcount()
first_zero_idx = (df['X'] == 0).idxmax()
# Rows before the first zero count from the sentinel at -1, so add 1.
# BUG FIX: the original chained assignment df['Y'].iloc[0:first_zero_idx] += 1
# relies on modifying a possibly-temporary object (SettingWithCopy) and does
# not work under pandas copy-on-write; use a single-step .iloc assignment.
df.iloc[0:first_zero_idx, df.columns.get_loc('Y')] += 1
# Locate the three largest values of a 5x3 frame of randoms.
df = pd.DataFrame(data = np.random.random(size = (5, 3)))
# unstack() flattens the frame into a Series indexed by (column, row).
df.unstack()
>>> 0 0 0.481196
1 0.375653
2 0.150875
3 0.049221
4 0.091384
1 0 0.523462
1 0.951614
2 0.097590
3 0.826970
4 0.589755
2 0 0.893346
1 0.966250
2 0.811824
3 0.658163
4 0.863555
dtype: float64
# Sort the flattened values ascending.
df.unstack().sort_values()
>>> 0 3 0.049221
4 0.091384
1 2 0.097590
0 2 0.150875
1 0.375653
0 0.481196
1 0 0.523462
4 0.589755
2 3 0.658163
2 0.811824
1 3 0.826970
2 4 0.863555
0 0.893346
1 1 0.951614
2 1 0.966250
dtype: float64
df.unstack().sort_values()[-3:].index.tolist()
# .sort_values()[-3:] keeps the last three entries of the ascending sort,
# i.e. the three largest values
# .index.tolist() returns their (column, row) coordinates
# Replace every negative value with the mean of the non-negative values
# of its own group.
data = {'grps': list('aaabbcaabcccbbc'),
        'vals': [-12, 345, 3, 1, 45, 14, 4, -52, 54, 23, -235, 21, 57, 3, 87]}
df = pd.DataFrame(data)

def replace(group):
    """Overwrite the negative entries of *group* with the mean of the rest."""
    negative = group < 0
    group[negative] = group[~negative].mean()
    return group

# transform() returns a column the same length as the original frame,
# so the result aligns row-for-row with df.
df['vals'] = df.groupby(['grps'])['vals'].transform(replace)
print(df)
# transform(函数) -> 返回一个列向量,该列向量和原子数据帧等长,然后各个子数据帧还是和原来一样的行数
# apply() 与transform() 的相同点与不同点
# 相同点:
# 都能针对 dataframe 完成特征的计算,并且常常与groupby()方法一起使用。
# 不同点:
# apply()里面可以跟自定义的函数,包括简单的求和函数以及复杂的特征间的差值函数等
# (注:apply不能直接使用agg()方法 / transform()中的python内置函数,例如sum、max、min、'count'等方法)
# transform() 里面不能跟自定义的特征交互函数.
# 因为transform是真针对每一元素(即每一列特征操作)进行计算,也就是说在使用 transform() 方法时,需要记得三点:
# 1、它只能对每一列进行计算,所以在groupby()之后,.transform()之前是要指定要操作的列,这点也与apply有很大的不同。
# 2、由于是只能对每一列计算,所以方法的通用性相比apply()就局限了很多,例如只能求列的最大/最小/均值/方差/分箱等操作
# 3、transform还有什么用呢?最简单的情况是试图将函数的结果分配回原始的dataframe。也就是说返回的shape是(len(df),1)。
# 注:如果与groupby()方法联合使用,需要对值进行去重。
# Per-group rolling mean over a window of 3 that ignores NaN values:
# the numerator treats NaN as 0, the denominator counts only non-missing
# values in the window, so missing entries simply drop out of the mean.
df = pd.DataFrame({'group': list('aabbabbbabab'),
                   'value': [1, 2, 3, np.nan, 2, 3, np.nan, 1, 7, 3, np.nan, 8]})
g1 = df.groupby(['group'])['value']
g2 = df.fillna(0).groupby(['group'])['value']
# Ratio of per-group rolling sums (NaN as 0) to per-group rolling counts
# of actually-present values.
s = g2.rolling(3, min_periods = 1).sum() / g1.rolling(3, min_periods = 1).count()
# Drop the group level and restore the original row order.
s.reset_index(level = 0, drop = True).sort_index()
# All business days (freq = 'B') in 2015.
dti = pd.date_range(start = '2015-01-01', end = '2015-12-31', freq = 'B')
dti
>>> DatetimeIndex(['2015-01-01', '2015-01-02', '2015-01-05', '2015-01-06', '2015-01-07',
'2015-01-08', '2015-01-09', '2015-01-12', '2015-01-13', '2015-01-14',
...
'2015-12-18', '2015-12-21', '2015-12-22', '2015-12-23', '2015-12-24',
'2015-12-25', '2015-12-28', '2015-12-29', '2015-12-30', '2015-12-31'],
dtype = 'datetime64[ns]', length = 261, freq = 'B')
# One random value per business day, indexed by those dates.
s = pd.Series(data = np.random.rand(len(dti)), index = dti)
s.head(10)
# Every 2015 business day is already in the index; head(10) just shows the first ten.
>>> 2015-01-01 0.668115
2015-01-02 0.959374
2015-01-05 0.032494
2015-01-06 0.775584
2015-01-07 0.424158
2015-01-08 0.901054
2015-01-09 0.664729
2015-01-12 0.221497
2015-01-13 0.839977
2015-01-14 0.440950
Freq: B, dtype: float64
# Boolean mask over the index by day of week (Monday == 0).
s.index.weekday == 3
>>> array([ True, False, False, False, False, True, False, False, False,
...,
False, False, False, True, False, False, False, False, True])
s[s.index.weekday == 3].sum()
# NOTE(review): with Monday == 0, Wednesday is weekday == 2; weekday == 3
# actually selects Thursdays, contrary to the original note.
s.resample(rule = 'M')
# DataFrame.resample(rule, axis, closed)
# With rule = 'M' each resampled bucket is labelled by its month-end date.
# Convenience method for frequency conversion and resampling of time series. Object must have a datetime-like index (DatetimeIndex, PeriodIndex, or TimedeltaIndex), or pass datetime-like values to the on or level keyword.
>>> DatetimeIndexResampler [freq = <MonthEnd>, axis = 0, closed = right, label = right, convention = start, base = 0]
# Monthly mean of the series.
s.resample(rule = 'M').mean()
>>> 2015-01-31 0.518175
2015-02-28 0.525824
2015-03-31 0.530855
2015-04-30 0.478707
2015-05-31 0.439820
2015-06-30 0.522842
2015-07-31 0.566864
2015-08-31 0.458453
2015-09-30 0.477120
2015-10-31 0.525901
2015-11-30 0.405665
2015-12-31 0.453303
Freq: M, dtype: float64
# Within each 4-month bucket, the date on which the maximum value occurred:
# the result's index is the bucket label, the value is that date.
s.groupby(pd.Grouper(freq = '4M')).idxmax()
# The third Thursday of every month across 2015-2016.
pd.date_range(start = '2015-01-01', end = '2016-12-31', freq = 'WOM-3THU')
# WOM-1MON, WOM-2MON, ... ("week of month") generate a given weekday of the
# first/second/third/fourth week of each month, e.g. WOM-1MON is the first
# Monday of each month.
# Reference: https://blog.csdn.net/misxu890312/article/details/100192129
>>> DatetimeIndex(['2015-01-15', '2015-02-19', '2015-03-19', '2015-04-16', '2015-05-21',
'2015-06-18', '2015-07-16', '2015-08-20', '2015-09-17', '2015-10-15',
'2015-11-19', '2015-12-17', '2016-01-21', '2016-02-18', '2016-03-17',
'2016-04-21', '2016-05-19', '2016-06-16', '2016-07-21', '2016-08-18',
'2016-09-15', '2016-10-20', '2016-11-17', '2016-12-15'],
dtype = 'datetime64[ns]', freq = 'WOM-3THU')
# NOTE(review): this snippet appears before the DataFrame it operates on is
# defined (the flights data built just below) and duplicates the identical
# line there — it looks like a misplaced copy in the article.
df['FlightNumber'] = df['FlightNumber'].interpolate().astype(int)
df
# .interpolate() fills the missing values by linear (equidistant)
# interpolation, then astype(int) makes the flight numbers integers.
# Messy flights data: mixed-case city pairs, missing flight numbers,
# list-valued delays, and airline names polluted with punctuation.
data = {'From_To': ['LoNDon_paris', 'MAdrid_miLAN', 'londON_StockhOlm', 'Budapest_PaRis', 'Brussels_londOn'],
'FlightNumber': [10045, np.nan, 10065, np.nan, 10085],
'RecentDelays': [[23, 47], [], [24, 43, 87], [13], [67, 32]],
'Airline': ['KLM(!)', '<Air France> (12)', '(British Airways. )', '12. Air France', '"Swiss Air"']}
df = pd.DataFrame(data = data)
df
>>> From_To Flight Number RecentDelays Airline
0 LoNDon_paris 10045.0 [23, 47] KLM(!)
1 MAdrid_miLAN NaN [] <Air France> (12)
2 londON_StockhOlm 10065.0 [24, 43, 87] (British Airways. )
3 Budapest_PaRis NaN [13] 12. Air France
4 Brussels_londOn 10085.0 [67, 32] "Swiss Air"
# Fill the missing flight numbers by linear interpolation and cast to int.
df['FlightNumber'] = df['FlightNumber'].interpolate().astype(int)
# Split 'From_To' on '_' into two separate columns.
temp = df.From_To.str.split('_', expand = True)
temp.columns = ['From', 'To']
df = df.join(temp)
df = df.drop('From_To', axis = 1)
df
# df.drop('From_To', axis = 1) removes the now-redundant combined column.
# expand: bool, default False. If True the split returns a DataFrame
# (or MultiIndex); if False it returns a Series (or Index).
# temp.columns = ['From', 'To'] names the two columns produced by the split.
# Reminder on axis: df.mean(axis = 1) computes row-wise means, whereas
# df.drop(name, axis = 1) deletes a *column*, not a row.
df['From'] = df['From'].str.capitalize()
df['To'] = df['To'].str.capitalize()
df
# capitalize(): first letter upper-case, the rest lower-case.
df['Airline'] = df['Airline'].str.extract(pat = '([a-zA-Z\s]+)', expand = False).str.strip()
df
# .str.extract(pat = '([a-zA-Z\s]+)', expand = False) keeps only the first
# run of letters/whitespace, dropping digits and punctuation.
# .str.strip() removes the given characters (whitespace/newlines by default)
# from both ends of each string; it never touches characters in the middle.
# Expand the list-valued column: applying pd.Series turns each RecentDelays
# list into a row of a new DataFrame, one column per delay position.
delays = df['RecentDelays'].apply(func = pd.Series)
# shorter lists are padded with NaN on the right
# delays = df['RecentDelays'].apply(func = pd.Series)
>>> 0 1 2
0 23.0 47.0 NaN
1 NaN NaN NaN
2 24.0 43.0 87.0
3 13.0 NaN NaN
4 67.0 32.0 NaN
delays.columns
>>> RangeIndex(start = 0, stop = 3, step = 1)
# delays.columns = ['delay_{}'.format(n) for n in range(1, len(delays.columns) + 1)]
# Rename the auto-generated integer columns 0, 1, 2 to delay_1, delay_2, delay_3.
# BUG FIX: range() takes no keyword arguments — the original
# range(start = 1, stop = ...) raises TypeError at runtime.
delays.columns = [f'delay_{num}' for num in range(1, len(delays.columns) + 1)]
delays
# The f-string comprehension generates the sequential column names.
>>> Index(['delay_1', 'delay_2', 'delay_3'], dtype = 'object')
# Replace the list-valued RecentDelays column with the expanded delay_* columns.
df = df.drop('RecentDelays', axis = 1).join(delays)
df
letters = ['A', 'B', 'C']
numbers = list(range(4))
# Cartesian product of letters x numbers as a two-level index.
mi = pd.MultiIndex.from_product([letters, numbers])
# pd.MultiIndex.from_product() builds a hierarchical index in which each of
# A, B, C is paired with all four numbers.
>>> MultiIndex(levels = [['A', 'B', 'C'], [0, 1, 2, 3]],
labels = [[0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2], [0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3]])
>>> MultiIndex([('A', '0'),
('A', '1'),
('A', '2'),
('A', '3'),
('B', '0'),
('B', '1'),
('B', '2'),
('B', '3'),
('C', '0'),
('C', '1'),
('C', '2'),
('C', '3')])
# Twelve random values on the hierarchical index.
s = pd.Series(data = np.random.rand(12), index = mi)
s
>>> A 0 0.796546
1 0.554168
2 0.904286
3 0.266291
B 0 0.649349
1 0.502057
2 0.023291
3 0.117036
C 0 0.889227
1 0.907910
2 0.018923
3 0.038807
dtype: float64
# Check whether the index is lexicographically sorted.
# Method 1
s.index.is_lexsorted()
# Method 2
s.index.lexsort_depth == s.index.nlevels
# is_lexsorted() returns True when the index is sorted in dictionary order.
# NOTE(review): MultiIndex.is_lexsorted() was deprecated and removed in
# pandas 2.0; the replacement is index.is_monotonic_increasing.
s.loc[:, [1, 3]]
# The first ':' spans the whole first level;
# [1, 3] selects the entries whose second-level label is 1 or 3
# (a slice 1:3 would instead select the label range 1 through 3).
# Method 1
s.loc[pd.IndexSlice[:'B', 2:]]
# Method 2
# s.loc[slice(None, 'B'), slice(2, None)]
# Method 1
s.sum(level = 0)
# Method 2
#s.unstack().sum(axis = 0)
# s.sum(level = 0) sums within each first-level group; a bare s.sum() totals everything.
# NOTE(review): sum(level = ...) is deprecated in pandas >= 1.3; use s.groupby(level = 0).sum().
new_s = s.swaplevel(0, 1)
print(new_s)
print(new_s.index.is_lexsorted())
new_s = new_s.sort_index()
print(new_s)
# s.swaplevel(0, 1) exchanges the first and second index levels.
# .sort_index() then re-sorts by the new index order.
import matplotlib.pyplot as plt
data = {"xs": [1, 5, 2, 8, 1],
"ys": [4, 2, 1, 9, 6]}
df = pd.DataFrame(data = data)
plt.style.use(style = 'ggplot')
# plt.style.use selects a predefined figure style ('ggplot' here).
# Reference: https://zhuanlan.zhihu.com/p/37891729
# Scatter plot of ys against xs, black 'x' markers.
df.plot.scatter(x = "xs", y = "ys", color = "black", marker = "x")
# .plot.scatter() draws a scatter plot
<matplotlib.axes._subplots.AxesSubplot at 0x18387eb3d88>
data = {"productivity": [5, 2, 3, 1, 4, 5, 6, 7, 8, 3, 4, 8, 9],
"hours_in": [1, 9, 6, 5, 3, 9, 2, 9, 1, 7, 4, 2, 2],
"happiness": [2, 1, 3, 2, 3, 1, 2, 3, 1, 2, 2, 1, 3],
"caffienated": [0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0]}
df = pd.DataFrame(data = data)
# Marker size scales with happiness, marker colour with caffeination.
df.plot.scatter(x = "hours_in", y = "productivity", s = df.happiness * 100, c = df.caffienated)
# s: marker size
# c: marker colour
data = {"revenue": [57, 68, 63, 71, 72, 90, 80, 62, 59, 51, 47, 52],
"advertising": [2.1, 1.9, 2.7, 3.0, 3.6, 3.2, 2.7, 2.4, 1.8, 1.6, 1.3, 1.9],
"month": range(12)}
df = pd.DataFrame(data = data)
# Bar chart of revenue per month with advertising overlaid as a line on a
# secondary y axis, both drawn into the same axes object.
ax = df.plot.bar(x = "month", y = "revenue", color = "green")
df.plot.line(x = "month", y = "advertising", secondary_y = True, ax = ax)
ax.set_xlim((-1, 12))
# .plot.bar() draws the bar chart
# .plot.line() draws the line chart
# secondary_y = True adds a second y axis
# ax = ax puts both plots on the same axes
# .set_xlim() sets the x-axis range