HTML版本点击此处
# 导入所需的库 import pandas as pd import matplotlib.pyplot as plt import numpy.random as np import sys import matplotlib %matplotlib inline # 打印版本信息 print('Python version ' + sys.version) print('Pandas version: ' + pd.__version__) print('Matplotlib version ' + matplotlib.__version__) 12345678910111213
Python version 3.6.5 |Anaconda, Inc.| (default, Apr 29 2018, 16:14:56) [GCC 7.2.0] Pandas version: 0.23.0 Matplotlib version 2.2.2 1234
# 设置随机种子保证结果相同 np.seed(111) # 创建测试数据的函数,默认记录数为1 def CreateDataSet(Number=1): Output = [] for i in range(Number): # 创建以周为单位的日期列表,其中freq='W-MON'表示以周为频率,且计入每周周一 rng = pd.date_range(start='1/1/2009',end='12/31/2012',freq='W-MON') # 创建rng中每一天的数据,size参数表示数量 data = np.randint(low=25,high=1000,size=len(rng)) # 状态列表 status = [1,2,3] # 创建随机状态列表,包含1,2,3三种状态 random_status = [status[np.randint(0,len(status))] for i in range(len(rng))] # 状态池 states = ['GA','FL','fl','NY','NJ','TX'] random_states = [states[np.randint(0,len(states))] for i in range(len(rng))] # 将我们得到的四个随机列表合并 Output.extend(zip(random_states,random_status,data,rng)) return Output
123456789101112131415161718192021222324252627282930dataset = CreateDataSet() df = pd.DataFrame(data=dataset,columns=['State','Status','CustomerCount','StatusDate']) df.info() 123
<class 'pandas.core.frame.DataFrame'> RangeIndex: 209 entries, 0 to 208 Data columns (total 4 columns): State 209 non-null object Status 209 non-null int64 CustomerCount 209 non-null int64 StatusDate 209 non-null datetime64[ns] dtypes: datetime64[ns](1), int64(2), object(1) memory usage: 6.6+ KB 123456789
# 比较两次结果的差异,number指定为4之后年份是从09-12的四次循环 dataset = CreateDataSet(4) df = pd.DataFrame(data=dataset,columns=['State','Status','CustomerCount','StatusDate']) df.info() 1234
<class 'pandas.core.frame.DataFrame'> RangeIndex: 836 entries, 0 to 835 Data columns (total 4 columns): State 836 non-null object Status 836 non-null int64 CustomerCount 836 non-null int64 StatusDate 836 non-null datetime64[ns] dtypes: datetime64[ns](1), int64(2), object(1) memory usage: 26.2+ KB 123456789
df.head() 1 StateStatusCustomerCountStatusDate0fl31222009-01-051fl39552009-01-122NY15222009-01-193fl34362009-01-264TX23982009-02-02
# 用dataFrame的to_excel()函数将结果保存为excel格式文件 df.to_excel('Lesson3.xlsx',index=False) print('Done!') 1234
Done! 1
# 用pd.read_excel()从excel文件中读取数据 # 路径位置,windows下需要用r' ' 修正带反斜杠的路径 Location = 'Lesson3.xlsx' # index_col 参数传入指定的索引列,可参考下面的df.index df = pd.read_excel(Location,0,index_col='StatusDate') df.dtypes 12345678
State object Status int64 CustomerCount int64 dtype: object 1234
df.index 1
DatetimeIndex(['2009-01-05', '2009-01-12', '2009-01-19', '2009-01-26', '2009-02-02', '2009-02-09', '2009-02-16', '2009-02-23', '2009-03-02', '2009-03-09', ... '2012-10-29', '2012-11-05', '2012-11-12', '2012-11-19', '2012-11-26', '2012-12-03', '2012-12-10', '2012-12-17', '2012-12-24', '2012-12-31'], dtype='datetime64[ns]', name='StatusDate', length=836, freq=None) 12345678
# 注意比较,第一列变成了索引列,而不再是默认的0,1,2... df.head() 12 StateStatusCustomerCountStatusDate2009-01-05fl31222009-01-12fl39552009-01-19NY15222009-01-26fl34362009-02-02TX2398
""" 检查数据的正确性 - state列的字母都要大写 - 只选择status列为1的数据 - 将state列的NJ与NY合并为NY - 删除一些很大或者很小的特殊值(customerCount列) """ df['State'].unique() 12345678910
array(['fl', 'NY', 'TX', 'GA', 'NJ', 'FL'], dtype=object) 1
# 把state列都转化为大写,运用lambda表达式 df['State'] = df.State.apply(lambda x: x.upper()) df['State'].unique() 1234
array(['FL', 'NY', 'TX', 'GA', 'NJ'], dtype=object) 1
# 只选中status 列为1的记录 # mask掩膜 mask = df['Status'] == 1 df = df[mask] df.head() 1234567 StateStatusCustomerCountStatusDate2009-01-19NY15222009-03-16NY13552009-03-23GA19722009-04-20NY18202009-04-27NY1447
# 把NJ与NY合并成NY mask = df.State == 'NJ' df.State[mask] = 'NY' df.head(10) 123456 StateStatusCustomerCountStatusDate2009-01-19NY15222009-03-16NY13552009-03-23GA19722009-04-20NY18202009-04-27NY14472009-05-11NY19872009-05-18TX13772009-05-25NY13782009-06-08TX1452009-06-22TX1896
df['State'].unique() 1
array(['NY', 'GA', 'TX', 'FL'], dtype=object) 1
# 绘制顾客数量的变化图, plot(), figsize()指定图的大小 df['CustomerCount'].plot(figsize=(15,5)) 123
<matplotlib.axes._subplots.AxesSubplot at 0x7f33f95be160> 1
# 将state为'NY'的数据依照index列进行排序, sortdf = df[df['State']=='NY'].sort_index(axis=0) sortdf.head(10) 1234 StateStatusCustomerCountStatusDate2009-01-19NY15222009-02-23NY17102009-03-16NY13552009-03-23NY17282009-03-30NY18632009-04-13NY15202009-04-20NY18202009-04-20NY19372009-04-27NY14472009-05-04NY1497
""" 接下来我们将用groupby(),sum()等函数得到顾客数/天/状态 因为我们需要将statusDate与state作为分组依据传入groupby函数,而groupby接受的 参数类型是columns,所以我们需要先用reset_index()把statusDate从index转回column """ # 注意State,StatusDate的顺序 Daily = df.reset_index().groupby(['State','StatusDate']).sum() Daily.head() 123456789 StatusCustomerCountStateStatusDateFL2009-01-1213192009-02-0216532009-03-2317522009-04-0617952009-04-201788
""" 从上面的结果中我们可以看到State和StatusDate列自动变成了 index列,我们可以把index理解为数据库里的表的主键,只是数据帧里的 index列不要求唯一,我们可以通过这个index来进行选择,绘图,计算操作 """ # 删除Status 列 del Daily['Status'] Daily.head() 123456789 CustomerCountStateStatusDateFL2009-01-123192009-02-026532009-03-237522009-04-067952009-04-20788
# 输出现在Daily的index,得到的结果是MultiIndex多重索引 Daily.index 123
MultiIndex(levels=[['FL', 'GA', 'NY', 'TX'], [2009-01-12 00:00:00, 2009-01-19 00:00:00, 2009-02-02 00:00:00, 2009-02-23 00:00:00, 2009-03-16 00:00:00, 2009-03-23 00:00:00, 2009-03-30 00:00:00, 2009-04-06 00:00:00, 2009-04-13 00:00:00, 2009-04-20 00:00:00, 2009-04-27 00:00:00, 2009-05-04 00:00:00, 2009-05-11 00:00:00, 2009-05-18 00:00:00, 2009-05-25 00:00:00, 2009-06-08 00:00:00, 2009-06-22 00:00:00, 2009-07-06 00:00:00, 2009-07-13 00:00:00, 2009-07-20 00:00:00, 2009-07-27 00:00:00, 2009-08-10 00:00:00, 2009-08-17 00:00:00, 2009-08-24 00:00:00, 2009-08-31 00:00:00, 2009-09-07 00:00:00, 2009-09-14 00:00:00, 2009-09-21 00:00:00, 2009-09-28 00:00:00, 2009-10-05 00:00:00, 2009-10-12 00:00:00, 2009-10-19 00:00:00, 2009-10-26 00:00:00, 2009-11-02 00:00:00, 2009-11-23 00:00:00, 2009-11-30 00:00:00, 2009-12-07 00:00:00, 2009-12-14 00:00:00, 2009-12-21 00:00:00, 2010-01-04 00:00:00, 2010-01-11 00:00:00, 2010-01-18 00:00:00, 2010-01-25 00:00:00, 2010-02-08 00:00:00, 2010-02-15 00:00:00, 2010-02-22 00:00:00, 2010-03-01 00:00:00, 2010-03-08 00:00:00, 2010-03-15 00:00:00, 2010-03-29 00:00:00, 2010-04-05 00:00:00, 2010-04-12 00:00:00, 2010-04-19 00:00:00, 2010-04-26 00:00:00, 2010-05-03 00:00:00, 2010-05-10 00:00:00, 2010-05-17 00:00:00, 2010-05-24 00:00:00, 2010-05-31 00:00:00, 2010-06-14 00:00:00, 2010-06-21 00:00:00, 2010-06-28 00:00:00, 2010-07-05 00:00:00, 2010-07-19 00:00:00, 2010-07-26 00:00:00, 2010-08-02 00:00:00, 2010-08-16 00:00:00, 2010-08-30 00:00:00, 2010-09-06 00:00:00, 2010-09-13 00:00:00, 2010-09-20 00:00:00, 2010-09-27 00:00:00, 2010-10-04 00:00:00, 2010-10-11 00:00:00, 2010-10-18 00:00:00, 2010-10-25 00:00:00, 2010-11-01 00:00:00, 2010-11-08 00:00:00, 2010-11-15 00:00:00, 2010-11-29 00:00:00, 2010-12-20 00:00:00, 2011-01-03 00:00:00, 2011-01-10 00:00:00, 2011-01-17 00:00:00, 2011-01-31 00:00:00, 2011-02-07 00:00:00, 2011-02-14 00:00:00, 2011-02-21 00:00:00, 2011-02-28 00:00:00, 2011-03-07 00:00:00, 2011-03-14 00:00:00, 2011-04-04 00:00:00, 2011-04-18 00:00:00, 2011-04-25 00:00:00, 2011-05-02 00:00:00, 2011-05-09 00:00:00, 2011-05-16 00:00:00, 2011-05-23 00:00:00, 2011-05-30 00:00:00, 2011-06-13 00:00:00, 2011-06-20 00:00:00, 2011-06-27 00:00:00, 2011-07-04 00:00:00, 2011-07-11 00:00:00, 2011-07-25 00:00:00, 2011-08-01 00:00:00, 2011-08-08 00:00:00, 2011-08-15 00:00:00, 2011-08-29 00:00:00, 2011-09-05 00:00:00, 2011-09-12 00:00:00, 2011-09-19 00:00:00, 2011-09-26 00:00:00, 2011-10-03 00:00:00, 2011-10-24 00:00:00, 2011-10-31 00:00:00, 2011-11-07 00:00:00, 2011-11-14 00:00:00, 2011-11-28 00:00:00, 2011-12-05 00:00:00, 2011-12-19 00:00:00, 2011-12-26 00:00:00, 2012-01-02 00:00:00, 2012-01-16 00:00:00, 2012-01-23 00:00:00, 2012-01-30 00:00:00, 2012-02-06 00:00:00, 2012-02-13 00:00:00, 2012-02-20 00:00:00, 2012-02-27 00:00:00, 2012-03-05 00:00:00, 2012-03-12 00:00:00, 2012-03-19 00:00:00, 2012-03-26 00:00:00, 2012-04-02 00:00:00, 2012-04-09 00:00:00, 2012-04-23 00:00:00, 2012-04-30 00:00:00, 2012-05-07 00:00:00, 2012-05-14 00:00:00, 2012-05-28 00:00:00, 2012-06-04 00:00:00, 2012-06-18 00:00:00, 2012-07-09 00:00:00, 2012-07-16 00:00:00, 2012-07-30 00:00:00, 2012-08-06 00:00:00, 2012-08-20 00:00:00, 2012-08-27 00:00:00, 2012-09-03 00:00:00, 2012-09-10 00:00:00, 2012-09-17 00:00:00, 2012-09-24 00:00:00, 2012-10-01 00:00:00, 2012-10-08 00:00:00, 2012-10-15 00:00:00, 2012-10-22 00:00:00, 2012-10-29 00:00:00, 2012-11-05 00:00:00, 2012-11-12 00:00:00, 2012-11-19 00:00:00, 2012-11-26 00:00:00, 2012-12-10 00:00:00, 2012-12-17 00:00:00]], labels=[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3], [0, 2, 5, 7, 9, 15, 16, 17, 18, 21, 23, 25, 26, 28, 29, 33, 36, 40, 42, 43, 44, 45, 47, 48, 50, 53, 55, 57, 58, 61, 62, 64, 67, 69, 70, 71, 72, 73, 77, 78, 79, 80, 87, 89, 90, 92, 93, 96, 97, 100, 103, 104, 105, 108, 109, 110, 113, 115, 118, 119, 124, 126, 127, 128, 129, 132, 137, 138, 139, 142, 147, 151, 152, 153, 154, 159, 163, 5, 6, 20, 21, 22, 25, 26, 32, 33, 35, 41, 46, 49, 50, 56, 59, 67, 68, 70, 72, 74, 75, 76, 80, 83, 85, 87, 91, 93, 95, 106, 109, 110, 123, 128, 133, 135, 141, 160, 161, 162, 1, 3, 4, 5, 6, 8, 9, 10, 11, 12, 14, 17, 19, 22, 24, 26, 27, 29, 30, 31, 34, 37, 38, 39, 40, 41, 51, 52, 55, 56, 63, 65, 71, 73, 74, 76, 79, 81, 82, 83, 84, 88, 91, 92, 94, 98, 101, 102, 103, 105, 107, 108, 111, 112, 113, 114, 117, 122, 123, 125, 126, 130, 131, 132, 134, 140, 141, 143, 144, 145, 147, 149, 150, 156, 158, 160, 13, 15, 16, 46, 50, 54, 57, 58, 60, 66, 69, 77, 86, 90, 92, 98, 99, 107, 113, 116, 120, 121, 131, 136, 137, 138, 142, 144, 146, 147, 148, 155, 157]], names=['State', 'StatusDate']) 123
# 只获取state索引 Daily.index.levels[0] 12
Index(['FL', 'GA', 'NY', 'TX'], dtype='object', name='State') 1
# 获取statusDate索引 Daily.index.levels[1] 12
DatetimeIndex(['2009-01-12', '2009-01-19', '2009-02-02', '2009-02-23', '2009-03-16', '2009-03-23', '2009-03-30', '2009-04-06', '2009-04-13', '2009-04-20', ... '2012-10-08', '2012-10-15', '2012-10-22', '2012-10-29', '2012-11-05', '2012-11-12', '2012-11-19', '2012-11-26', '2012-12-10', '2012-12-17'], dtype='datetime64[ns]', name='StatusDate', length=164, freq=None) 12345678
# 可视化,把每一个state都绘制出来,用loc定位到每一个state Daily.loc['FL'].plot() Daily.loc['GA'].plot() Daily.loc['NY'].plot() Daily.loc['TX'].plot(); 12345
# 我们也可以在指定state的基础上指定年份,如绘制2012年的顾客数 Daily.loc['FL']['2012':].plot() Daily.loc['GA']['2012':].plot() Daily.loc['NY']['2012':].plot() Daily.loc['TX']['2012':].plot(); 12345
""" 通过上面的图像,我们可以寻找异常值(超过特定范围)然后删除异常值,确保我们最终 得到平滑的图像。 StateYearMonth -这里我们将依照state,year,month(statusDate的属性)来分组 Daily['Outlier'] - 异常值 注意我们这里用的是transfrom而不是apply函数,因为transform 能保证我们数据帧的行 数与列数不变。此外,从上面的图像中可看到,数据不是高斯分布的,所以我们在指定正常 范围的时候不能用stddev与mean等标准,这里我们用的是百分数 """ # 分组 StateYearMonth = Daily.groupby([Daily.index.get_level_values(0), Daily.index.get_level_values(1).year, Daily.index.get_level_values(1).month]) # 下界 Daily['Lower'] = StateYearMonth['CustomerCount'].transform( lambda x: x.quantile(q=.25) - (1.5*x.quantile(q=.75)-x.quantile(q=.25)) ) # 上界 Daily['Upper'] = StateYearMonth['CustomerCount'].transform( lambda x: x.quantile(q=.75) + (1.5*x.quantile(q=.75)-x.quantile(q=.25)) ) Daily['Outlier'] = (Daily['CustomerCount'] < Daily['Lower']) | (Daily['CustomerCount'] > Daily['Upper']) # 删除异常值 Daily = Daily[Daily['Outlier'] == False] Daily.head()
1234567891011121314151617181920212223 CustomerCountLowerUpperOutlierStateStatusDateFL2009-01-12319159.500478.500False2009-02-02653326.500979.500False2009-03-23752376.0001128.000False2009-04-06795389.6251193.375False2009-04-20788389.6251193.375False# 接下来我们把所有同一天内但state不同的数据加起来并得到每个月的最大顾客数量 ALL = pd.DataFrame(Daily['CustomerCount'].groupby(Daily.index.get_level_values(1)).sum()) ALL.columns = ['CustomerCount'] YearMonth = ALL.groupby([lambda x:x.year,lambda x:x.month]) # 得到每年和每个月最大的客流量 ALL['Max'] = YearMonth['CustomerCount'].transform(lambda x:x.max()) ALL.head() 123456789101112 CustomerCountMaxStatusDate2009-01-123195222009-01-195225222009-02-026537102009-02-237107102009-03-163552452
""" 我们给客流量设上目标 - 12/31/2011 - 1,000 - 12/31/2012 - 2,000 - 12/31/2013 - 3,000 """ data = [1000,2000,3000] #这里把freq参数设置为a表示annual 每年 idx = pd.date_range(start='12/31/2011',end='12/31/2013',freq='A') BHAG = pd.DataFrame(data,index=idx,columns=['BHAG']) BHAG 12345678910111213 BHAG2011-12-3110002012-12-3120002013-12-313000
# 把BHAG和ALL 数据集合并 combined = pd.concat([ALL,BHAG],axis=0) combined = combined.sort_index(axis=0) combined.tail() 12345
/home/dreamboy/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:3: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version of pandas will change to not sort by default. To accept the future behavior, pass 'sort=True'. To retain the current behavior and silence the warning, pass sort=False This is separate from the ipykernel package so we can avoid doing imports until 12345678 BHAGCustomerCountMax2012-11-26NaN367.01042.02012-12-10NaN769.0817.02012-12-17NaN817.0817.02012-12-312000.0NaNNaN2013-12-313000.0NaNNaN
fig,axes = plt.subplots(figsize=(12,7)) # 这里的method设置为pad表示用前一个非缺失值去替换该缺失值 combined['BHAG'].fillna(method='pad').plot(color='green', label='BHAG') combined['Max'].plot(color='blue', label='All Markets') plt.legend(loc='best'); 123456
# 接下来我们来试着预测明年的顾客量 Year = combined.groupby(lambda x:x.year).max() Year 1234 BHAGCustomerCountMax2009NaN2545.02545.02010NaN1819.01819.020111000.01957.01957.020122000.01689.01689.020133000.0NaNNaN
# 增加用来表示每年变化情况的一列 Year['YR_PCT_CHANGE'] = Year['Max'].pct_change(periods=1) Year 123 BHAGCustomerCountMaxYR_PCT_CHANGE2009NaN2545.02545.0NaN2010NaN1819.01819.0-0.28526520111000.01957.01957.00.07586620122000.01689.01689.0-0.13694420133000.0NaNNaN0.000000
# 假设增长率不变,计算明年的顾客量 (1 + Year.ix[2012,'YR_PCT_CHANGE']) * Year.loc[2012,'Max'] 12
/home/dreamboy/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:2: DeprecationWarning: .ix is deprecated. Please use .loc for label based indexing or .iloc for positional indexing See the documentation here: http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated 1457.7010730710272 1234567891011121314
# 绘图 # 总的数据,即把state相加得到的结果 ALL['Max'].plot(figsize=(10, 5));plt.title('ALL Markets') # 各个市场,用子图 fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(20, 10)) fig.subplots_adjust(hspace=1.0) # 在子图之间开出空白空间 Daily.loc['FL']['CustomerCount']['2012':].fillna(method='pad').plot(ax=axes[0,0]) Daily.loc['GA']['CustomerCount']['2012':].fillna(method='pad').plot(ax=axes[0,1]) Daily.loc['TX']['CustomerCount']['2012':].fillna(method='pad').plot(ax=axes[1,0]) Daily.loc['NY']['CustomerCount']['2012':].fillna(method='pad').plot(ax=axes[1,1]) # 增加标题 axes[0,0].set_title('Florida') axes[0,1].set_title('Georgia') axes[1,0].set_title('Texas') axes[1,1].set_title('North East');
123456789101112131415161718
相关知识
如何在Python中使用Pandas库进行季节性调整?
使用pandas绘制两列csv的直方图
【Python】—— pandas 数据分析
《中国传统插花系列教程
python数据分析pandas画图
【绿化教程系列】单文件软件制作教程
pandas读写csv文件,及注意事项
中国传统插花系列教程:研习高级
中国传统插花系列教程
机器学习系列
网址: Pandas系列教程(三) https://m.huajiangbk.com/newsview1163583.html
上一篇: 遗传算法Python 教程(1) |
下一篇: Python3编程指南 |