数据科学——使用真实数据工作

1、从文本文件中读取

import pandas as pd

# Load a delimited text file into a DataFrame and print it.
# A raw string keeps the Windows backslashes literal: "\C" inside a normal
# string is an invalid escape sequence that newer Python versions warn about.
# pd.read_table is the public entry point for the same reader that was
# previously reached through pd.io.parsers.
color_table = pd.read_table(r"D:\data\Colors.txt")
print(color_table)

2、读取CSV(逗号分隔)格式的文件

import pandas as pd

# Load the CSV file into a DataFrame.
# A raw string keeps the Windows backslashes literal ("\T" in a normal string
# is an invalid escape sequence); pd.read_csv is the public entry point.
titanic = pd.read_csv(r"D:\data\Titanic.csv")

# Double brackets select the 'age' column as a one-column DataFrame.
X = titanic[['age']]
print(X)

# Optimized retrieval: .values returns the underlying NumPy array instead of
# a DataFrame.
X = titanic[['age']].values
print(X)

3、读取Excel和其他的微软办公文件

import pandas as pd

# Raw string: "\V" in a normal string literal is an invalid escape sequence.
values_path = r"D:\data\Values.xlsx"

# One-shot read of a single worksheet; the literal string 'NA' is treated as
# a missing value.
trig_values = pd.read_excel(values_path, 'Sheet1', index_col=None, na_values=['NA'])
print(trig_values)

# More efficient and convenient: open the workbook through ExcelFile and parse
# the sheet from it. The `with` block closes the workbook handle afterwards,
# which the bare ExcelFile(...) call never did.
with pd.ExcelFile(values_path) as xls:
    trig_values = xls.parse('Sheet1', index_col=None, na_values=['NA'])
print(trig_values)

4、以非结构化文件的形式来发送数据(存在问题)

from skimage.io import imread
from skimage.transform import resize
from matplotlib import pyplot as plt
import matplotlib.cm as cm

example_file = ("http://upload.wikimedia.org/wikipedia/commons/7/7d/Dog_face.png")

# Load the image and convert the color image to grayscale.
# The keyword is `as_gray`; the older `as_grey` spelling was removed from
# scikit-image and raises a TypeError on current releases.
image = imread(example_file, as_gray=True)
plt.imshow(image, cmap=cm.gray)  # grayscale color map
plt.show()

# More information about the image.
print("data type:%s,shape:%s"%(type(image),image.shape))

# Crop the image: rows 5..69, columns 0..69.
image2 = image[5:70,0:70]
# Show the cropped result (the original code displayed `image` again here,
# so the crop was never visible).
plt.imshow(image2, cmap=cm.gray)
plt.show()

# Resize the crop to 30x30. 'edge' is the scikit-image name for the border
# handling that used to be spelled 'nearest'.
image3 = resize(image2, (30,30), mode='edge')
plt.imshow(image3, cmap=cm.gray)
plt.show()  # without this the resized image is never displayed
print("data type:%s,shape:%s"%(type(image3),image3.shape))

# Flatten the 2-D image into a 1-D array.
image_row = image3.flatten()
print("data type:%s,shape:%s"%(type(image_row),image_row.shape))

5、访问来自Web的数据(此处以本地保存的XML文件为例)

from lxml import objectify
import pandas as pd

# Parse an XML file whose root holds record elements, each with three child
# fields, and collect the fields' text into a DataFrame.
columns = ['Number', 'String', 'Boolean']

# Raw string: "\X" in a normal string literal is an invalid escape sequence.
# Binary mode leaves encoding detection to the XML parser, and the `with`
# block closes the file handle that the original open() call leaked.
with open(r"D:\data\XMLData.xml", "rb") as xml_file:
    xml = objectify.parse(xml_file)
root = xml.getroot()

# Walk every record under the root instead of a hard-coded range(0, 4), so the
# loop neither truncates longer files nor raises IndexError on shorter ones.
# getchildren() is used on purpose: iterating an objectify element directly
# walks same-tag siblings, not its children.
rows = []
for record in root.getchildren():
    obj = record.getchildren()
    rows.append(dict(zip(columns, [obj[0].text, obj[1].text, obj[2].text])))

# Build the frame in one step: DataFrame.append was removed in pandas 2.0 and
# copied the whole frame on every call.
df = pd.DataFrame(rows, columns=columns)
print(df)
上一篇:C++ 函数重载


下一篇:人力资源数据分析与挖掘