Pandas Learning Exercises
Python Pandas
import pandas as pd
pd.__version__
arry = [0, 1, 2, 3, 4]
df = pd.Series(data = arry)
df
# If no index is specified, it defaults to starting from 0
>>> 0    0
1    1
2    2
3    3
4    4
dtype: int64
data = {'a':1, 'b':2, 'c':3, 'd':4, 'e':5}
df = pd.Series(data = data)
df
>>> a    1
b    2
c    3
d    4
e    5
dtype: int64
import numpy as np
dates = pd.date_range(start = 'today', periods = 6)
num_arr = np.random.randn(6, 4)
columns = ['A', 'B', 'C', 'D']
df = pd.DataFrame(data = num_arr, index = dates, columns = columns)
df
# pd.date_range('today', periods = 6) defines a time series used as the index
# np.random.randn(6, 4) supplies a NumPy array of random values
# columns = ['A', 'B', 'C', 'D'] uses the list as the column names
df = pd.read_csv(filepath_or_buffer = 'test.csv', encoding = 'gbk', sep = ';')
df
df.to_csv('test.csv')
# df.to_csv('output.csv') saves df as a CSV file named output.csv
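A minimal round-trip sketch (the small frame and the output.csv name below are only for illustration); passing index = False keeps the row index out of the file.

df_demo = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})
df_demo.to_csv('output.csv', index = False)   # do not write the index as an extra column
df_back = pd.read_csv('output.csv')           # reads back the same two columns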
import numpy as np
data = {'animal': ['cat', 'cat', 'snake', 'dog', 'dog', 'cat', 'snake', 'cat', 'dog', 'dog'],
        'age': [2.5, 3, 0.5, np.nan, 5, 2, 4.5, np.nan, 7, 3],
        'visits': [1, 3, 2, 3, 2, 3, 1, 1, 2, 1],
        'priority': ['yes', 'yes', 'no', 'yes', 'no', 'no', 'no', 'yes', 'no', 'no']}
index_labels = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']
df = pd.DataFrame(data = data, index = index_labels)
df
# Method 1
df.info()
# Method 2
df.describe()
# Method 1
df.iloc[:3]
# Method 2
df.head(n = 3)
# Method 1
df.loc[:, ['animal', 'age']]
# Method 2
df[['animal', 'age']]
# Note the extra pair of brackets inside
df.loc[df.index[[3, 4, 8]], ['animal', 'age']]
df[df['age'] > 3]
df['age'].isnull()
>>> a    False
b    False
c    False
d     True
...
g    False
h     True
i    False
j    False
Name: age, dtype: bool
df[df['age'].isnull()]
# Method 1
df[(df['age'] > 2) & (df['age'] < 4)]
# Method 2
df[df['age'].between(2, 4)]
# Note: between() includes both endpoints by default, so it is not strictly identical to the exclusive comparison above
df.loc['f', 'age'] = 1.5
df
df['visits'].sum()
df.groupby('animal')['age'].mean()
# Insert a row (values follow the column order animal, age, visits, priority)
df.loc['k'] = ['dog', 5.5, 2, 'no']
# Delete the row again
df = df.drop('k')
df['animal'].value_counts()
# .value_counts() counts how often each value occurs
df.sort_values(by = ['age', 'visits'], ascending = [False, True])
# False means descending, True means ascending
df['priority'] = df['priority'].map({'yes': True, 'no': False})
df
df['animal'] = df['animal'].replace(to_replace = 'snake', value = 'python')
df
df.age.dtypes
df.age = df.age.astype(dtype = 'float')
# Convert age to 'float' here;
# without the conversion, calling
# df.pivot_table(index = 'animal', columns = 'visits', values = 'age', aggfunc = 'mean') directly raises an error
df.age.dtypes
>>> dtype('float64')
df.pivot_table(index = 'animal', columns = 'visits', values = 'age', aggfunc = 'mean')
data = {'A': [1, 2, 2, 3, 2, 5, 5, 5, 6, 7, 7]}
df = pd.DataFrame(data = data)
# Method 1:
df1 = df.loc[df['A'].shift() != df['A']]
print(df1)
# df[column].shift(periods = 1, axis = 0)
# Method 1 only removes consecutive duplicate rows; non-consecutive duplicates are kept.
# Method 2:
df1 = df.drop_duplicates(subset = 'A')
# drop_duplicates() removes duplicate rows of a DataFrame based on the given column(s) and returns a DataFrame.
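To see the difference on the data above (values worked out by hand from A = [1, 2, 2, 3, 2, 5, 5, 5, 6, 7, 7]): the shift-based method keeps the later, non-consecutive 2, while drop_duplicates removes every repeated value.

print(df.loc[df['A'].shift() != df['A']]['A'].tolist())   # [1, 2, 3, 2, 5, 6, 7] -- the second run of 2 survives
print(df.drop_duplicates(subset = 'A')['A'].tolist())     # [1, 2, 3, 5, 6, 7]    -- all duplicates removed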
df = pd.DataFrame(data = np.random.random(size = (5, 3)))
df_sub = df.sub(other = df.mean(axis = 1), axis = 0)
# df.sub(other = df.mean(axis = 1), axis = 0) first takes the mean of each row, then subtracts it element-wise along that row.
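As a quick sanity check on the line above (nothing new beyond the df just defined), the row means of df_sub should come out approximately zero.

print(df_sub.mean(axis = 1))   # each row mean is ~0 up to floating-point error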
list('abcde')
>>> ['a', 'b', 'c', 'd', 'e']
df = pd.DataFrame(data = np.random.random(size = (5, 5)), columns = list('abcde'))
print(df)
df.sum().idxmin()
df = pd.DataFrame({'A': list('aaabbcaabcccbbc'),
                   'B': [12, 345, 3, 1, 45, 14, 4, 52, 54, 23, 235, 21, 57, 3, 87]})
df1 = df.groupby(by = 'A')['B'].nlargest(n = 3)
>>> A
a  1     345
   7      52
   0      12
b  12     57
   8      54
   4      45
c  10    235
   14     87
   9      23
Name: B, dtype: int64
df1 = df.groupby(by = 'A')['B'].nlargest(n = 3).sum(level = 0)
# level: if the axis is a MultiIndex (hierarchical), sum along the given level and collapse the result into a Series.
>>> A
a    409    (= 345 + 52 + 12)
b    156    (= 57 + 54 + 45)
c    345    (= 235 + 87 + 23)
Name: B, dtype: int64
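In recent pandas versions Series.sum(level = ...) is deprecated; a sketch of the equivalent groupby spelling, under that assumption:

# Same per-group totals, written with groupby instead of sum(level = 0)
df1 = df.groupby(by = 'A')['B'].nlargest(n = 3).groupby(level = 0).sum()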
data = {'A': [1, 2, 11, 11, 33, 34, 35, 40, 79, 99],
        'B': [1, 2, 11, 11, 33, 34, 35, 40, 79, 99]}
df = pd.DataFrame(data = data)
groupby_rule = pd.cut(x = df['A'], bins = np.arange(start = 0, stop = 101, step = 10))
groupby_rule
>>> 0      (0, 10]
1      (0, 10]
2     (10, 20]
3     (10, 20]
4     (30, 40]
5     (30, 40]
6     (30, 40]
7     (30, 40]
8     (70, 80]
9    (90, 100]
Name: A, dtype: category
Categories (10, interval[int64]): [(0, 10] < (10, 20] < (20, 30] < (30, 40] ... (60, 70] < (70, 80] < (80, 90] < (90, 100]]
df1 = df.groupby(by = groupby_rule)['B'].sum()
df1
# pandas.cut(x, bins, right = True, labels = None, retbins = False, precision = 3, include_lowest = False, duplicates = 'raise')
# x: the array-like data to be binned; must be 1-dimensional (a DataFrame cannot be used)
# bins: the intervals to cut into; three forms are accepted: a single int, a sequence of scalars (array), or a pandas.IntervalIndex
>>> A
(0, 10]        3
(10, 20]      22
(20, 30]       0
(30, 40]     142
(40, 50]       0
(50, 60]       0
(60, 70]       0
(70, 80]      79
(80, 90]       0
(90, 100]     99
Name: B, dtype: int64
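The other bins forms mentioned above can be sketched like this (the 'low'/'high' labels are made up for the example):

cats = pd.cut(x = df['A'], bins = 4)                                           # bins as an int: 4 equal-width bins over the range of A
labelled = pd.cut(x = df['A'], bins = [0, 50, 100], labels = ['low', 'high'])  # explicit bin edges plus custom labels
print(labelled)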
data = {'X': [7, 2, 0, 3, 4, 2, 5, 0, 3, 4]}
df = pd.DataFrame(data = data)
# Method 1
izero = np.r_[-1, (df['X'] == 0).to_numpy().nonzero()[0]]   # positions of the zeros, with a sentinel -1 in front
idx = np.arange(len(df))
df['Y'] = idx - izero[np.searchsorted(izero - 1, idx) - 1]
print(df)
# Method 2
x = (df['X'] != 0).cumsum()
y = x != x.shift()
df['Y'] = y.groupby((y != y.shift()).cumsum()).cumsum()
# Method 3
df['Y'] = df.groupby((df['X'] == 0).cumsum()).cumcount()
first_zero_idx = (df['X'] == 0).idxmax()
df['Y'].iloc[0:first_zero_idx] += 1
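For the X above, all three methods produce the same "distance since the last zero" column (worked out by hand from the data, so treat it as an expected result rather than captured output):

# X: [7, 2, 0, 3, 4, 2, 5, 0, 3, 4]
# Y: [1, 2, 0, 1, 2, 3, 4, 0, 1, 2]
print(df)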
df = pd.DataFrame(data = np.random.random(size = (5, 3)))
df.unstack()
>>> 0  0    0.481196
   1    0.375653
   2    0.150875
   3    0.049221
   4    0.091384
1  0    0.523462
   1    0.951614
   2    0.097590
   3    0.826970
   4    0.589755
2  0    0.893346
   1    0.966250
   2    0.811824
   3    0.658163
   4    0.863555
dtype: float64
df.unstack().sort_values()
>>> 0  3    0.049221
   4    0.091384
1  2    0.097590
0  2    0.150875
   1    0.375653
   0    0.481196
1  0    0.523462
   4    0.589755
2  3    0.658163
   2    0.811824
1  3    0.826970
2  4    0.863555
   0    0.893346
1  1    0.951614
2  1    0.966250
dtype: float64
df.unstack().sort_values()[-3:].index.tolist()
# .sort_values()[-3:] keeps the three largest values (the last three after sorting in ascending order)
# .index.tolist() retrieves their (column, row) positions
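An alternative with the same result (the three positions come back in descending rather than ascending order) is Series.nlargest:

df.unstack().nlargest(3).index.tolist()   # picks the three largest values directly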
data = {'grps': list('aaabbcaabcccbbc'),
        'vals': [-12, 345, 3, 1, 45, 14, 4, -52, 54, 23, -235, 21, 57, 3, 87]}
df = pd.DataFrame(data = data)

def replace(group):
    mask = group < 0
    group[mask] = group[~mask].mean()
    return group

df['vals'] = df.groupby(['grps'])['vals'].transform(replace)
print(df)
# transform(func) -> returns a column the same length as the original DataFrame, with each group keeping its original number of rows
# apply() vs transform():
# Similarities:
#   Both compute features on a DataFrame and are commonly combined with groupby().
# Differences:
#   apply() can take an arbitrary custom function, from a simple sum to a function that combines several columns
#   (note: apply does not take the built-in aggregation names that agg()/transform() accept, such as 'sum', 'max', 'min', 'count').
#   transform() cannot take a custom function that works across columns, because it operates on one column (element-wise) at a time.
#   When using transform(), keep three points in mind:
#   1. It only computes per column, so after groupby() you must select the column to operate on before .transform(); this differs from apply.
#   2. Because it is restricted to per-column computation, it is less general than apply(); typical uses are per-column max/min/mean/variance/binning.
#   3. Its most useful property is that the result can be assigned straight back to the original DataFrame, i.e. the returned shape is (len(df), 1).
# Note: when combined with groupby(), pay attention to duplicated values, since the group result is broadcast back to every row.
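A small contrast between the two on the same grouped column (a hedged sketch; the lambdas are only illustrative): apply returns one value per group, while transform broadcasts the group value back onto every row.

g = df.groupby('grps')['vals']
print(g.apply(lambda v: v.mean()))        # one mean per group  -> length = number of groups
print(g.transform(lambda v: v.mean()))    # group mean repeated -> length = len(df)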
df = pd.DataFrame({'group': list('aabbabbbabab'),
                   'value': [1, 2, 3, np.nan, 2, 3, np.nan, 1, 7, 3, np.nan, 8]})
g1 = df.groupby(['group'])['value']
g2 = df.fillna(0).groupby(['group'])['value']
s = g2.rolling(3, min_periods = 1).sum() / g1.rolling(3, min_periods = 1).count()
s.reset_index(level = 0, drop = True).sort_index()
dti = pd.date_range(start = '2015-01-01', end = '2015-12-31', freq = 'B')
dti
>>> DatetimeIndex(['2015-01-01', '2015-01-02', '2015-01-05', '2015-01-06', '2015-01-07',
               '2015-01-08', '2015-01-09', '2015-01-12', '2015-01-13', '2015-01-14',
               ...
               '2015-12-18', '2015-12-21', '2015-12-22', '2015-12-23', '2015-12-24',
               '2015-12-25', '2015-12-28', '2015-12-29', '2015-12-30', '2015-12-31'],
              dtype='datetime64[ns]', length=261, freq='B')
s = pd.Series(data = np.random.rand(len(dti)), index = dti)
s.head(10)
# All 2015 business days already serve as the index of the random values; this only shows the first ten
>>> 2015-01-01    0.668115
2015-01-02    0.959374
2015-01-05    0.032494
2015-01-06    0.775584
2015-01-07    0.424158
2015-01-08    0.901054
2015-01-09    0.664729
2015-01-12    0.221497
2015-01-13    0.839977
2015-01-14    0.440950
Freq: B, dtype: float64
s.index.weekday == 3
>>> array([ True, False, False, False, False,  True, False, False, False,
       ...,
       False, False, False,  True, False, False, False, False,  True])
s[s.index.weekday == 3].sum()
# Note: in pandas Monday = 0, so weekday == 3 selects Thursdays; Wednesday would be weekday == 2
s.resample(rule = 'M')
# DataFrame.resample(rule, axis, closed)
# After resampling with 'M', each group is labelled with the month-end date
# Convenience method for frequency conversion and resampling of time series. Object must have a datetime-like index (DatetimeIndex, PeriodIndex, or TimedeltaIndex), or pass datetime-like values to the on or level keyword.
>>> DatetimeIndexResampler [freq = <MonthEnd>, axis = 0, closed = right, label = right, convention = start, base = 0]
s.resample(rule = 'M').mean()
>>> 2015-01-31    0.518175
2015-02-28    0.525824
2015-03-31    0.530855
2015-04-30    0.478707
2015-05-31    0.439820
2015-06-30    0.522842
2015-07-31    0.566864
2015-08-31    0.458453
2015-09-30    0.477120
2015-10-31    0.525901
2015-11-30    0.405665
2015-12-31    0.453303
Freq: M, dtype: float64
s.groupby(pd.Grouper(freq = '4M')).idxmax()
# The index shows each 4-month group's label; the value is the date on which the maximum in that group occurs.
pd.date_range(start = '2015-01-01', end = '2016-12-31', freq = 'WOM-3THU')
# WOM-1MON, WOM-2MON, ... (WeekOfMonth) generate a given weekday of the 1st, 2nd, 3rd or 4th week of each month;
# WOM-1MON, for example, means the first Monday of every month
# Reference: https://blog.csdn.net/misxu890312/article/details/100192129
>>> DatetimeIndex(['2015-01-15', '2015-02-19', '2015-03-19', '2015-04-16', '2015-05-21',
               '2015-06-18', '2015-07-16', '2015-08-20', '2015-09-17', '2015-10-15',
               '2015-11-19', '2015-12-17', '2016-01-21', '2016-02-18', '2016-03-17',
               '2016-04-21', '2016-05-19', '2016-06-16', '2016-07-21', '2016-08-18',
               '2016-09-15', '2016-10-20', '2016-11-17', '2016-12-15'],
              dtype='datetime64[ns]', freq='WOM-3THU')
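The same anchored offset can be built explicitly (a sketch, assuming pd.offsets.WeekOfMonth with its 0-based week argument, so week = 2 and weekday = 3 mean the third Thursday):

pd.date_range(start = '2015-01-01', end = '2016-12-31',
              freq = pd.offsets.WeekOfMonth(week = 2, weekday = 3))   # equivalent to 'WOM-3THU'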
data = {'From_To': ['LoNDon_paris', 'MAdrid_miLAN', 'londON_StockhOlm', 'Budapest_PaRis', 'Brussels_londOn'],
        'FlightNumber': [10045, np.nan, 10065, np.nan, 10085],
        'RecentDelays': [[23, 47], [], [24, 43, 87], [13], [67, 32]],
        'Airline': ['KLM(!)', '<Air France> (12)', '(British Airways. )', '12. Air France', '"Swiss Air"']}
df = pd.DataFrame(data = data)
df
>>>             From_To  FlightNumber  RecentDelays              Airline
0      LoNDon_paris       10045.0      [23, 47]               KLM(!)
1      MAdrid_miLAN           NaN            []    <Air France> (12)
2  londON_StockhOlm       10065.0  [24, 43, 87]  (British Airways. )
3    Budapest_PaRis           NaN          [13]       12. Air France
4   Brussels_londOn       10085.0      [67, 32]          "Swiss Air"
df['FlightNumber'] = df['FlightNumber'].interpolate().astype(int)
# .interpolate() fills the missing FlightNumbers with evenly spaced values
temp = df.From_To.str.split('_', expand = True)
temp.columns = ['From', 'To']
df = df.join(temp)
df = df.drop('From_To', axis = 1)
df
# expand: boolean, default False. If True, split() returns a DataFrame (or MultiIndex);
# if False, it returns a Series (or Index).
# temp.columns = ['From', 'To'] names the two columns produced by the split From and To.
# Calling df.mean(axis = 1) computes row-wise means,
# but df.drop(name, axis = 1) drops a column, not a row.
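What expand controls can be seen with a tiny standalone Series (s_demo is made up for the illustration):

s_demo = pd.Series(['LoNDon_paris', 'MAdrid_miLAN'])
print(s_demo.str.split('_'))                  # expand = False (default): a Series of lists
print(s_demo.str.split('_', expand = True))   # expand = True: a DataFrame with one column per part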
df['From'] = df['From'].str.capitalize()
df['To'] = df['To'].str.capitalize()
df
# Makes the first letter upper case and the rest lower case
df['Airline'] = df['Airline'].str.extract(pat = r'([a-zA-Z\s]+)', expand = False).str.strip()
df
# .str.extract(pat = r'([a-zA-Z\s]+)', expand = False) keeps only the first run of letters and spaces, dropping the surrounding punctuation and digits
# .str.strip() removes the given characters (whitespace/newlines by default) from both ends of each string.
# Note: strip only removes leading and trailing characters, never characters in the middle.
delays = df['RecentDelays'].apply(func = pd.Series)
# Splits the lists stored in the column into separate Series columns
>>>       0     1     2
0  23.0  47.0   NaN
1   NaN   NaN   NaN
2  24.0  43.0  87.0
3  13.0   NaN   NaN
4  67.0  32.0   NaN
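An alternative way to expand the lists, usually faster than apply(pd.Series) (a sketch relying on pandas padding ragged rows with NaN):

delays_alt = pd.DataFrame(df['RecentDelays'].tolist(), index = df.index)   # same shape as the apply(pd.Series) result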
delays.columns
>>> RangeIndex(start = 0, stop = 3, step = 1)
# delays.columns = ['delay_{}'.format(n) for n in range(1, len(delays.columns) + 1)]
delays.columns = [f'delay_{num}' for num in range(1, len(delays.columns) + 1)]
delays.columns
# The comprehension renames the columns to delay_1, delay_2, ...
# Note: range() only takes positional arguments, so the bounds cannot be passed as keywords
>>> Index(['delay_1', 'delay_2', 'delay_3'], dtype = 'object')
df = df.drop('RecentDelays', axis = 1).join(delays)
df
letters = ['A', 'B', 'C']
numbers = list(range(4))
mi = pd.MultiIndex.from_product([letters, numbers])
# pd.MultiIndex.from_product() creates a hierarchical index in which each of A, B, C is paired with the four numbers
>>> MultiIndex([('A', 0),
            ('A', 1),
            ('A', 2),
            ('A', 3),
            ('B', 0),
            ('B', 1),
            ('B', 2),
            ('B', 3),
            ('C', 0),
            ('C', 1),
            ('C', 2),
            ('C', 3)],
           )
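from_product also takes a names argument, which lets later selections refer to the levels by name (a small sketch reusing letters and numbers from above):

mi_named = pd.MultiIndex.from_product([letters, numbers], names = ['letter', 'number'])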
s = pd.Series(data = np.random.rand(12), index = mi)
s
>>> A  0    0.796546
   1    0.554168
   2    0.904286
   3    0.266291
B  0    0.649349
   1    0.502057
   2    0.023291
   3    0.117036
C  0    0.889227
   1    0.907910
   2    0.018923
   3    0.038807
dtype: float64
# Method 1
s.index.is_lexsorted()
# Method 2
s.index.lexsort_depth == s.index.nlevels
# If the index is sorted lexicographically, .is_lexsorted() returns True.
s.loc[:, [1, 3]]
# The first : selects every label of the first level
# 1, 3 are individual labels of the second level; 1:3 would instead select the slice of labels from 1 to 3
# Method 1
s.loc[pd.IndexSlice[:'B', 2:]]
# Method 2
# s.loc[slice(None, 'B'), slice(2, None)]
# Method 1
s.sum(level = 0)
# Method 2
# s.unstack().sum(axis = 1)
# (unstack() moves the inner level to columns, so the per-letter sum is a row-wise sum, axis = 1)
# s.sum(level = 0) sums within each label of the first level; s.sum() alone would sum all values.
new_s = s.swaplevel(0, 1)
print(new_s)
print(new_s.index.is_lexsorted())
new_s = new_s.sort_index()
print(new_s)
# s.swaplevel(0, 1) swaps the first and second index levels.
# .sort_index() sorts by the index.
import matplotlib.pyplot as plt
data = {"xs": [1, 5, 2, 8, 1],
        "ys": [4, 2, 1, 9, 6]}
df = pd.DataFrame(data = data)
plt.style.use(style = 'ggplot')
# plt.style.use sets the plotting style
# Reference: https://zhuanlan.zhihu.com/p/37891729
df.plot.scatter(x = "xs", y = "ys", color = "black", marker = "x")
# .plot.scatter() draws a scatter plot
>>> <matplotlib.axes._subplots.AxesSubplot at 0x18387eb3d88>
data = {"productivity": [5, 2, 3, 1, 4, 5, 6, 7, 8, 3, 4, 8, 9],"hours_in": [1, 9, 6, 5, 3, 9, 2, 9, 1, 7, 4, 2, 2],"happiness": [2, 1, 3, 2, 3, 1, 2, 3, 1, 2, 2, 1, 3],"caffienated": [0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0]}df = pd.DataFrame(data = data)
df.plot.scatter(x = "hours_in", y = "productivity", s = df.happiness * 100, c = df.caffienated)# s 标记大小# c 标记颜色
data = {"revenue": [57, 68, 63, 71, 72, 90, 80, 62, 59, 51, 47, 52],"advertising": [2.1, 1.9, 2.7, 3.0, 3.6, 3.2, 2.7, 2.4, 1.8, 1.6, 1.3, 1.9],"month": range(12)}df = pd.DataFrame(data = data)
ax = df.plot.bar(x = "month", y = "revenue", color = "green")
df.plot.line(x = "month", y = "advertising", secondary_y = True, ax = ax)
ax.set_xlim((-1, 12))
# .plot.bar() draws a bar chart
# .plot.line() draws a line chart
# secondary_y = True plots the line against a second y axis
# ax = ax puts both plots on the same axes
# .set_xlim() sets the x-axis range