import pandas as pd
from datetime import datetime
#导入股票数据
df = pd.read_excel('p:\data\沪深股票股本变动数据.xlsx',
sheet_name='sharedata')
#查看df中每个字段数据类型,发现df['日期']是int64
print(df.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65865 entries, 0 to 65864
Data columns (total 15 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 市场类型 65865 non-null object
1 股票代码 65865 non-null object
2 日期 65865 non-null int64
3 变动类型 65865 non-null object
4 每10股票分红(元) 26897 non-null float64
5 配股价(元) 26897 non-null float64
6 每10股票送几股 26897 non-null float64
7 每10股票配几股 26897 non-null float64
8 前流通盘 38936 non-null float64
9 后流通盘 38936 non-null float64
10 前总股本 38936 non-null float64
11 后总股本 38936 non-null float64
12 浓缩比例 8 non-null float64
13 份数 24 non-null float64
14 行权价 24 non-null float64
dtypes: float64(11), int64(1), object(3)
memory usage: 7.5+ MB
None
#将int转换成str
df_date = df['日期'].apply(str)
#用to_datetime()函数将字符串转换成时间格式,并增加'时间'字段
df['时间'] = pd.to_datetime(df_date,format='%Y/%m/%d')
print(df['时间'])
0 1990-03-01
1 1990-09-27
2 1990-09-27
3 1991-02-26
4 1991-03-12
...
65860 2016-12-31
65861 2016-12-31
65862 2016-12-31
65863 2016-12-31
65864 2016-12-31
Name: 时间, Length: 65865, dtype: datetime64[ns]
#将日期格式化,并增加'格式化日期'字段
df['格式化日期1'] = df.时间.apply(lambda x: datetime.
strftime(x, format='%Y-%m-%d'))
df['格式化日期2'] = df.时间.dt.strftime('%Y-%m-%d')
print(df['格式化日期1'],'\n',df['格式化日期2'])
0 1990-03-01
1 1990-09-27
2 1990-09-27
3 1991-02-26
4 1991-03-12
...
65860 2016-12-31
65861 2016-12-31
65862 2016-12-31
65863 2016-12-31
65864 2016-12-31
Name: 格式化日期1, Length: 65865, dtype: object
0 1990-03-01
1 1990-09-27
2 1990-09-27
3 1991-02-26
4 1991-03-12
...
65860 2016-12-31
65861 2016-12-31
65862 2016-12-31
65863 2016-12-31
65864 2016-12-31
Name: 格式化日期2, Length: 65865, dtype: object
#抽取'时间'字段中的值
df['时间.年'] = df['时间'].dt.year
df['时间.月'] = df['时间'].dt.month
df['时间.周'] = df['时间'].dt.weekday
df['时间.日'] = df['时间'].dt.day
df['时间.时'] = df['时间'].dt.hour
df['时间.分'] = df['时间'].dt.minute
df['时间.秒'] = df['时间'].dt.second
print(df['时间.年'],df['时间.月'],df['时间.周'],df['时间.日'])
print(df['时间.时'],df['时间.分'],df['时间.秒'])
0 1990
1 1990
2 1990
3 1991
4 1991
...
65860 2016
65861 2016
65862 2016
65863 2016
65864 2016
Name: 时间.年, Length: 65865, dtype: int64 0 3
1 9
2 9
3 2
4 3
..
65860 12
65861 12
65862 12
65863 12
65864 12
Name: 时间.月, Length: 65865, dtype: int64 0 3
1 3
2 3
3 1
4 1
..
65860 5
65861 5
65862 5
65863 5
65864 5
Name: 时间.周, Length: 65865, dtype: int64 0 1
1 27
2 27
3 26
4 12
..
65860 31
65861 31
65862 31
65863 31
65864 31
Name: 时间.日, Length: 65865, dtype: int64
0 0
1 0
2 0
3 0
4 0
..
65860 0
65861 0
65862 0
65863 0
65864 0
Name: 时间.时, Length: 65865, dtype: int64 0 0
1 0
2 0
3 0
4 0
..
65860 0
65861 0
65862 0
65863 0
65864 0
Name: 时间.分, Length: 65865, dtype: int64 0 0
1 0
2 0
3 0
4 0
..
65860 0
65861 0
65862 0
65863 0
65864 0
Name: 时间.秒, Length: 65865, dtype: int64