现实遇到的问题
在测试导入excel数据时,需要自己造数据,如果写入大批量数据不可能自己一个个造,所以写了一个写入数据到excel的demo
下面为写入excel的demo
import os
import random
from faker import Faker
file_path = os.getcwd()
print(file_path)
f = Faker(locale=‘en_US‘)
class GenExcelUser(object):
def __init__(self, num=10, type=1, scenario=0):
"""
:param num: 生成手机号数量
:type 1为导入用户,excel中有三个字段:买家账号,买家手机号,用户标记;2为黑名单用户,只有手机号一个字段
:scenario 只针对导入用户生效,默认为正常数据,即买家账号,手机号,标记均存在
1:最后一条无手机号;2:最后一条无昵称:3:全无手机号;4:全无昵称;5:昵称手机号均为空
"""
self.dirname = ‘\导入测试数据.xlsx‘
self.buername = ‘autotest‘
self.num = num
self.type = type
self.lable = ‘50w_mark‘ # 标签名
self.scenario = scenario
def gen_phone(self):
a = random.sample(range(0, 10), 8)
ll = [str(i) for i in a]
res = (‘179‘ + ‘‘.join(ll))
return res
def generate_random_str(self):
"""
:return: 随机字符串 返回3位字符串
"""
import string
# a-z小写列表
lower_str = list(string.ascii_lowercase)
# a-z大写列表
up_str = list(string.ascii_uppercase)
# 0-9的列表,列表对应的为字符串
int_str = [str(i) for i in range(10)]
# 两个列表合并,在小写列表中做拓展
lower_str.extend(up_str)
lower_str.extend(int_str)
# 在列表中随机选取16个元素
ff = random.sample(lower_str, 7)
return ‘‘.join(ff)
def gene_data(self):
"""
导入用户
"""
buername = self.buername + ‘_‘ + str(self.generate_random_str())
phone = self.gen_phone()
lable = self.lable # 这边定义写死后,后面不管这么取,都为默认值
return locals()
# 弃用 该方法,该方法写入十万级别数据没问题,百万级别就会报错
# def main(self, num=100):
# import pandas as pd
# from pandas import DataFrame
#
# list_name = [self.gene_data()[‘buername‘] for i in range(num)]
# list_phone = [self.gene_data()[‘phone‘] for i in range(num)]
# list_lable = [self.gene_data()[‘lable‘] for i in range(num)]
# # 写
# dic1 = {‘账号‘: list_name,
# ‘手机号‘: list_phone,
# ‘标记‘: list_lable
# }
# df = pd.DataFrame(dic1)
# df.to_excel(self.dirname, index=False)
# print(‘写入完成‘)
def xw_toExcel(self, num=100): # xlsxwriter库储存数据到excel
import xlsxwriter as xw
fileName = file_path+self.dirname
workbook = xw.Workbook(fileName) # 创建工作簿
worksheet1 = workbook.add_worksheet("sheet1") # 创建子表
worksheet1.activate() # 激活表
title = [‘账号‘, ‘手机号‘, ‘标记‘] # 设置表头
worksheet1.write_row(‘A1‘, title) # 从A1单元格开始写入表头
i = 2 # 从第二行开始写入数据
data = [self.gene_data() for i in range(num)]
for j in range(len(data)):
insertData = [data[j]["buername"], data[j]["phone"], data[j]["lable"]]
row = ‘A‘ + str(i)
worksheet1.write_row(row, insertData)
i += 1
workbook.close() # 关闭表
if __name__ == ‘__main__‘:
import time
num = input(‘请输入需要导入数量: ‘)
start_time = time.time()
cl = GenExcelUser()
cl.xw_toExcel(num=int(num))
end_time = time.time()
print(‘总计耗时%s‘ % (end_time - start_time))
由于写入数据是随机的,无法保证数据是否有重复值,所以就写了一个解析excel文件,查看是否有重复数据
代码如下
import pandas as pd
import time
file_path = r‘./导入测试数据.xlsx‘
file_path1 = r‘./测试.xlsx‘
def excel_one_line_to_list():
"""
读取excel文件,读取第一列数据
"""
df = pd.read_excel(file_path, usecols=[0],
names=None) # 读取项目名称列,不要列名,读取两列则usecols=[0,1],对应列
df_li = df.values.tolist()
result = []
for s_li in df_li:
result.append(s_li[0])
print(‘列表数量为%d‘ % (len(result)))
return result
def compar_data(data: list):
"""
判断是否有重复数据
"""
num = 1
dic1 = {}
train_data = [i for i in data]
for i in train_data:
if i not in dic1.keys():
dic1[i] = num
else:
dic1[i] += 1
ll = []
for k, v in dic1.items():
if v == 2 or v > 2:
ll.append(k)
return ll
if __name__ == ‘__main__‘:
start_time = time.time()
train_data = excel_one_line_to_list()
ss = compar_data(train_data)
print(‘重复数量为%s,重复值列表为%s‘ % (str(len(ss)),str(ss)))
end_time = time.time()
print(‘总计耗时%s‘ % (end_time - start_time))