'''
要确定excel表格中的工作簿中工作表的数量、名称和每个工作表中行列的数量
open_workbook 模块用来读取和分析Excel文件
'''
import sys
from xlrd import open_workbook
input_file=sys.argv[1]
workbook=open_workbook(input_file)
print("Number of worksheet:",workbook.nsheets)
for worksheet in workbook.sheets():
print('Worksheet name:',worksheet.name,' number of rows in worksheet:',worksheet.nrows,' number of cols in worksheet',worksheet.ncols)
'''
处理单个工作表
读写Excel文件
worksheet.cell_value();返回数据
'''
import sys
from xlrd import open_workbook
from xlwt import Workbook
input_file=sys.argv[1]
output_file=sys.argv[2]
output_workbook=Workbook()
output_worksheet=output_workbook.add_sheet('copy1')
with open_workbook(input_file) as workbook:
worksheet=workbook.sheet_by_name('january_2013')
for row_index in range(worksheet.nrows):
for column_index in range(worksheet.ncols):
output_worksheet.write(row_index,column_index,worksheet.cell_value(row_index,column_index))
output_workbook.save(output_file)
'''
处理单个工作表
读写Excel文件-格式化数据
'''
import sys
from xlrd import open_workbook,xldate_as_tuple
from xlwt import Workbook
from datetime import date
input_file=sys.argv[1]
output_file=sys.argv[2]
output_workbook=Workbook()
output_worksheet=output_workbook.add_sheet('copy2')
with open_workbook(input_file) as workbook:
worksheet=workbook.sheet_by_name('january_2013')
for row_index in range(worksheet.nrows):
row_list_output=[]
for column_index in range(worksheet.ncols):
if worksheet.cell_type(row_index,column_index)==3:
date_cell=xldate_as_tuple(worksheet.cell_value(row_index,column_index),workbook.datemode)
date_cell=date(*date_cell[0:3]).strftime('%m-%d-%Y')
row_list_output.append(date_cell)
output_worksheet.write(row_index,column_index,date_cell)
else:
non_date_cell=worksheet.cell_value(row_index,column_index)
row_list_output.append(non_date_cell)
output_worksheet.write(row_index,column_index,non_date_cell)
output_workbook.save(output_file)
'''
筛选特定行-行中的值满足某个条件
'''
import sys
from xlrd import open_workbook,xldate_as_tuple
from xlwt import Workbook
from datetime import date
input_file=sys.argv[1]
output_file=sys.argv[2]
output_workbook=Workbook()
output_worksheet=output_workbook.add_sheet('copy3')
sales_amount_index=3
with open_workbook(input_file) as workbook:
worksheet=workbook.sheet_by_name('january_2013')
data=[]
header=worksheet.row_values(0)
data.append(header)
for row_index in range(1,worksheet.nrows):
row_list=[]
sales_amount=worksheet.cell_value(row_index,sales_amount_index)
if sales_amount>1400.0:
for column_index in range(worksheet.ncols):
cell_value=worksheet.cell_value(row_index,column_index)
cell_type=worksheet.cell_type(row_index,column_index)
if cell_type==3:
date_cell=xldate_as_tuple(cell_value,workbook.datemode)
date_cell=date(*date_cell[0:3]).strftime('%d-%m-%Y')
row_list.append(date_cell)
else:
row_list.append(cell_value)
if row_list:
data.append(row_list)
for list_index,output_list in enumerate(data):
for element_list,element in enumerate(output_list):
output_worksheet.write(list_index,element_list,element)
output_workbook.save(output_file)
'''
筛选特定行-行中的值属于某个集合
'''
import sys
from xlrd import open_workbook,xldate_as_tuple
from xlwt import Workbook
from datetime import date
input_file=sys.argv[1]
output_file=sys.argv[2]
output_workbook=Workbook()
output_worksheet=output_workbook.add_sheet('copy4')
important_dates=['01/24/2013', '01/31/2013']
purchase_date_index=4
with open_workbook(input_file) as workbook:
worksheet=workbook.sheet_by_name('january_2013')
heade=worksheet.row_values(0)
data=[]
data.append(heade)
for row_index in range(1,worksheet.nrows):
purchase_datetime=xldate_as_tuple(worksheet.cell_value(row_index,purchase_date_index),workbook.datemode)
purchase_date=date(*purchase_datetime[0:3]).strftime('%m/%d/%Y')
row_list=[]
if purchase_date in important_dates:
for column_index in range(worksheet.ncols):
cell_value=worksheet.cell_value(row_index,column_index)
cell_type=worksheet.cell_type(row_index,column_index)
if cell_type==3:
date_cell=xldate_as_tuple(cell_value,workbook.datemode)
date_cell=date(*date_cell[0:3]).strftime('%m/%d/%Y')
row_list.append(date_cell)
else:
row_list.append(cell_value)
if row_list:
data.append(row_list)
for list_index,output_list in enumerate(data):
for element_list,element in enumerate(output_list):
output_worksheet.write(list_index,element_list,element)
output_workbook.save(output_file)
'''
筛选特定行-行中的值属匹配于某个格式(正则表达式)
'''
import sys
from xlrd import open_workbook,xldate_as_tuple
from xlwt import Workbook
from datetime import date
import re
input_file=sys.argv[1]
output_file=sys.argv[2]
output_workbook=Workbook()
output_worksheet=output_workbook.add_sheet('copy5')
pattern=re.compile(r'(?P<my_pattern>^J.*)')
customer_name_column_index=1
with open_workbook(input_file) as workbook:
worksheet=workbook.sheet_by_name('january_2013')
data=[]
head=worksheet.row_values(0)
data.append(head)
for row_index in range(1,worksheet.nrows):
row_list=[]
if pattern.search(worksheet.cell_value(row_index,customer_name_column_index)):
for column_index in range(worksheet.ncols):
cell_value=worksheet.cell_value(row_index,column_index)
cell_type=worksheet.cell_type(row_index,column_index)
if cell_type==3:
date_cell=xldate_as_tuple(cell_value,workbook.datemode)
date_cell=date(*date_cell[0:3]).strftime('%d-%m-%Y')
row_list.append(date_cell)
else:
row_list.append(cell_value)
if row_list:
data.append(row_list)
for list_index,output_list in enumerate(data):
for elemnt_list,element in enumerate(output_list):
output_worksheet.write(list_index,elemnt_list,element)
output_workbook.save(output_file)
'''
选取特定的列-列索引值
'''
import sys
from xlrd import open_workbook,xldate_as_tuple
from xlwt import Workbook
from datetime import date
input_file=sys.argv[1]
output_file=sys.argv[2]
output_workbook=Workbook()
output_worksheet=output_workbook.add_sheet('copy6')
want_column=[1,4]
with open_workbook(input_file) as workbook:
worksheet=workbook.sheet_by_name('january_2013')
data=[]
for row_index in range(worksheet.nrows):
row_list=[]
for column_index in want_column:
cell_value=worksheet.cell_value(row_index,column_index)
cell_type=worksheet.cell_type(row_index,column_index)
if cell_type==3:
date_cell=xldate_as_tuple(cell_value,workbook.datemode)
date_cell=date(*date_cell[0:3]).strftime('%d-%m-%Y')
row_list.append(date_cell)
else:
row_list.append(cell_value)
if row_list:
data.append(row_list)
for list_index,output_list in enumerate(data):
for element_list,element in enumerate(output_list):
output_worksheet.write(list_index,element_list,element)
output_workbook.save(output_file)
'''
选取特定的列-列标题
'''
import sys
from xlrd import open_workbook,xldate_as_tuple
from xlwt import Workbook
from datetime import date
input_file=sys.argv[1]
putput_file=sys.argv[2]
output_workbook=Workbook()
output_worksheet=output_workbook.add_sheet('copy7')
want_columns=['Customer ID', 'Purchase Date']
with open_workbook(input_file) as workbook:
worksheet=workbook.sheet_by_name('january_2013')
data=[want_columns]
header_list=worksheet.row_values(0)
header_index_list=[]
for header_index in range(len(header_list)):
if header_list[header_index] in want_columns:
header_index_list.append(header_index)
for row_index in range(1,worksheet.nrows):
row_list=[]
for column_index in header_index_list:
cell_value=worksheet.cell_value(row_index,column_index)
cell_type=worksheet.cell_type(row_index,column_index)
if cell_type==3:
date_cell=xldate_as_tuple(cell_value,workbook.datemode)
date_cell=date(*date_cell[0:3]).strftime('%d-%m-%Y')
row_list.append(date_cell)
else:
row_list.append(cell_value)
if row_list:
data.append(row_list)
for index_list,output_list in enumerate(data):
for element_list,element in enumerate(output_list):
output_worksheet.write(index_list,element_list,element)
output_workbook.save(putput_file)
'''
在所有工作表中筛选特定行
'''
import sys
from xlrd import open_workbook,xldate_as_tuple
from xlwt import Workbook
from datetime import date
input_file=sys.argv[1]
output_file=sys.argv[2]
output_workbook=Workbook()
output_wroksheet=output_workbook.add_sheet('copy8')
sales_index=3
threshold=2000.0
first_worksheet=True
with open_workbook(input_file) as workbook:
data=[]
for worksheet in workbook.sheets():
if first_worksheet:
heade=worksheet.row_values(0)
data.append(heade)
first_worksheet=False
for row_index in range(1,worksheet.nrows):
row_list=[]
sale_amount=worksheet.cell_value(row_index,sales_index)
if sale_amount>threshold:
for column_index in range(worksheet.ncols):
cell_value = worksheet.cell_value(row_index, column_index)
cell_type=worksheet.cell_type(row_index,column_index)
if cell_type==3:
date_cell=xldate_as_tuple(cell_value,workbook.datemode)
date_cell=date(*date_cell[0:3]).strftime('%d-%m-%Y')
row_list.append(date_cell)
else:
row_list.append(cell_value)
if row_list:
data.append(row_list)
for index_list,output_list in enumerate(data):
for element_list,element in enumerate(output_list):
output_wroksheet.write(index_list,element_list,element)
output_workbook.save(output_file)
'''
在所有工作表中选定特定的列
'''
import sys
from xlrd import open_workbook,xldate_as_tuple
from xlwt import Workbook
from datetime import date
input_file=sys.argv[1]
output_file=sys.argv[2]
output_workbook=Workbook()
output_worksheet=output_workbook.add_sheet('copy9')
want_columns=['Customer Name', 'Sale Amount']
first_worksheet=True
with open_workbook(input_file) as workbook:
data=[want_columns]
index_of_cols_to_keep=[]
for worksheet in workbook.sheets():
if first_worksheet:
header=worksheet.row_values(0)
for columns_index in range(len(header)):
if header[columns_index] in want_columns:
index_of_cols_to_keep.append(columns_index)
first_worksheet=False
for row_index in range(1,worksheet.nrows):
row_list=[]
for columns_index in index_of_cols_to_keep:
cell_value=worksheet.cell_value(row_index,columns_index)
cell_type=worksheet.cell_type(row_index,columns_index)
if cell_type==3:
date_cell=xldate_as_tuple(cell_value,workbook.datemode)
date_cell=date(*date_cell[0:3]).strftime('%d-%m-%y')
row_list.append(date_cell)
else:
row_list.append(cell_value)
if row_list:
data.append(row_list)
for list_index,output_list in enumerate(data):
for element_list,element in enumerate(output_list):
output_worksheet.write(list_index,element_list,element)
output_workbook.save(output_file)
'''
在excel工作簿中读取一组工作表
在一组工作表中筛选特定行
从第一个和第二个工作表中筛选出销售额大于 $1900.00 的那些行
'''
import sys
from xlrd import open_workbook,xldate_as_tuple
from xlwt import Workbook
from datetime import date
input_file=sys.argv[1]
output_file=sys.argv[2]
output_workbook=Workbook()
output_worksheet=output_workbook.add_sheet('copy10')
threshold=1900.0
want_sheet=[0,1]
want_columns_index=3
first_worksheet=True
with open_workbook(input_file) as workbook:
data=[]
for sheet_index in range(workbook.nsheets):
if sheet_index in want_sheet:
worksheet=workbook.sheet_by_index(sheet_index)
if first_worksheet:
header_row=worksheet.row_values(0)
data.append(header_row)
first_worksheet=False
for row_index in range(1,worksheet.nrows):
row_list=[]
sale_amount=worksheet.cell_value(row_index,want_columns_index)
if sale_amount>threshold:
for column_index in range(worksheet.ncols):
cell_value=worksheet.cell_value(row_index,column_index)
cell_type=worksheet.cell_type(row_index,column_index)
if cell_type==3:
date_cell=xldate_as_tuple(cell_value,workbook.datemode)
date_cell=date(*date_cell[0:3]).strftime('%d-%m-%y')
row_list.append(date_cell)
else:
row_list.append(cell_value)
if row_list:
data.append(row_list)
for list_index ,output_list in enumerate(data):
for element_list,element in enumerate(output_list):
output_worksheet.write(list_index,element_list,element)
output_workbook.save(output_file)
'''
处理多个工作簿
工作表计数以及每个工作表中的行列计数
想知道一个文件夹中工作簿的数量,每个工作簿中工作表的数量,以及每个工作表中行与列的数量
'''
import glob
import sys
import os
from xlrd import open_workbook
input_directionary=sys.argv[1]
workbook_counter=0
for input_file in glob.glob(os.path.join(input_directionary,'*.xls*')):
workbook=open_workbook(input_file)
print('Workbook Name: %s ' % os.path.basename(input_file))
print('Number of worksheet: %d' % workbook.nsheets)
for worksheet in workbook.sheets():
print('Worksheet Name:',worksheet.name,' worksheet rows: ',worksheet.nrows,' worksheet cols:',worksheet.ncols)
workbook_counter=workbook_counter+1
print('Number of workbook: %d' %(workbook_counter))
'''
从多个工作簿中连接数据
将多个工作簿中所有工作表的数据垂直连接成一个输出文件
'''
import glob
import os
import sys
from xlrd import open_workbook,xldate_as_tuple
from xlwt import Workbook
from datetime import date
input_directionary=sys.argv[1]
output_file=sys.argv[2]
output_workbook=Workbook()
output_worksheet=output_workbook.add_sheet('copy11')
data=[]
first_worksheet=True
for input_file in glob.glob(os.path.join(input_directionary,'*.xls*')):
print('Workbook Name:',os.path.basename(input_file))
with open_workbook(input_file) as workbook:
for worksheet in workbook.sheets():
if first_worksheet:
header_row=worksheet.row_values(0)
data.append(header_row)
first_worksheet=False
for row_index in range(1,worksheet.nrows):
row_list=[]
for column_index in range(worksheet.ncols):
ceLl_value=worksheet.cell_value(row_index,column_index)
ceLl_type=worksheet.cell_type(row_index,column_index)
if ceLl_type==3:
date_cell=xldate_as_tuple(ceLl_value,workbook.datemode)
date_cell=date(*date_cell[0:3]).strftime('%d-%m-%y')
row_list.append(date_cell)
else:
row_list.append(ceLl_value)
if row_list:
data.append(row_list)
for list_index ,output_list in enumerate(data):
for element_list,element in enumerate(output_list):
output_worksheet.write(list_index,element_list,element)
output_workbook.save(output_file)
'''
为每个工作簿和工作表计算总数和均值
为多个工作簿计算工作表级别和工作簿级别的统计量
'''
import glob
import os
import sys
from xlrd import open_workbook,xldate_as_tuple
from xlwt import Workbook
input_floder=sys.argv[1]
output_file=sys.argv[2]
output_workbook=Workbook()
output_worsheet=output_workbook.add_sheet('sums_and_avergae')
all_data=[]
sales_amount_index=3
header=['workbook', 'worksheet', 'worksheet_total','worksheet_average','workbook_total','workbook_average']
all_data.append(header)
for input_file in glob.glob(os.path.join(input_floder,'*.xls*')):
with open_workbook(input_file) as workbook:
list_of_totals=[]
list_of_numbers=[]
workbook_output=[]
for worksheet in workbook.sheets():
total_sales=0
number_of_sales=0
worksheet_list=[]
worksheet_list.append(os.path.basename(input_file))
worksheet_list.append(worksheet.name)
for row_index in range(1,worksheet.nrows):
try:
total_sales=total_sales+float(str(worksheet.cell_value(row_index,sales_amount_index)).strip('$').replace(',',''))
number_of_sales=number_of_sales+1
except:
total_sales=total_sales+0
number_of_sales=number_of_sales+0
average_sales='%.2f' % (total_sales/number_of_sales)
worksheet_list.append(average_sales)
worksheet_list.append(float(average_sales))
list_of_totals.append(total_sales)
list_of_numbers.append(float(number_of_sales))
workbook_output.append(worksheet_list)
workbook_total=sum(workbook_total)
workbook_average=sum(list_of_totals)/sum(list_of_numbers)
for list_element in workbook_output:
list_element.append(worksheet_list)
list_element.append(workbook_average)
all_data.extend(workbook_output)
for list_index,output_list in enumerate(all_data):
for element_list,element in enumerate(output_list):
output_worsheet.write(list_index,element_list,element)
output_workbook.save(output_file)