# -*- coding: utf-8 -*-
# @Time : 2022/1/11 13:36
import os
import requests
from pathlib import Path
import time
def mkdir(path):
    """Create the directory ``path``, including any missing parent directories.

    If something already exists at ``path`` the call is a no-op, exactly as
    before. The creation is attempted directly instead of testing for
    existence first, so a directory that appears between a check and the
    creation can no longer make this function raise.

    :param path: directory to create (``str`` or ``os.PathLike``).
    :return: None
    """
    try:
        os.makedirs(path)  # also creates missing intermediate directories
    except FileExistsError:
        # Something is already present at ``path``; leave it untouched, which
        # matches the previous "only create when absent" behaviour.
        pass
def get_excel(dir_paths, url="http://192.168.1.31:56782/table_extract/", timeout=600):
    """Upload every PDF in a directory to the table-extraction service.

    Each file directly inside ``dir_paths`` whose suffix is ``.pdf``
    (compared case-insensitively) is POSTed to ``url`` as the multipart field
    ``file``. The raw response body is written to
    ``<dir_paths>/excel_result/<pdf name with .xlsx suffix>``. Progress and
    per-file timing are printed to stdout.

    :param dir_paths: directory that contains the PDF files.
    :param url: endpoint that receives the upload. Copy it exactly, including
        every ``/`` -- the author's note in this file records that a missing
        slash in the URL broke the call.
    :param timeout: value forwarded to the ``timeout`` argument of the POST.
    :return: None
    """
    dir_path = Path(dir_paths)
    # Collect the PDF files that sit directly in the directory.
    file_paths = [
        candidate for candidate in dir_path.glob('*.*')
        if candidate.suffix.lower() == '.pdf'
    ]
    if not file_paths:
        # Nothing to upload; as before, no output directory is created.
        return

    # The output directory is the same for every file, so create it once.
    xlsx_file_dir = dir_path.joinpath('excel_result')
    mkdir(xlsx_file_dir)

    headers = {'Connection': 'close'}
    total = len(file_paths)
    # Call the service once per file.
    for file_number, file_path in enumerate(file_paths, start=1):
        print("第{}份文件开始==========================================".format(file_number))
        print(' [{0} / {1}] 服务开始 {2}...'.format(file_number, total, file_path))
        start_time = time.time()
        xlsx_file = xlsx_file_dir.joinpath(file_path.with_suffix('.xlsx').name)

        # Both the session and the PDF handle are released even when the
        # request raises. The old code closed a freshly created session
        # instead of the one in use and never closed the file.
        with requests.session() as session, file_path.open('rb') as pdf_file:
            files = [('file', (file_path.name, pdf_file, 'application/pdf'))]
            response = session.post(url, headers=headers, files=files, timeout=timeout)

        # NOTE(review): the HTTP status is not checked, so an error response
        # body is written verbatim into the .xlsx file -- confirm whether
        # that is intended.
        with open(str(xlsx_file), 'wb') as f:
            f.write(response.content)

        duration_sec = time.time() - start_time
        minutes = int(duration_sec / 60)  # whole minutes, truncated
        print(' [{0} / {1}] 服务结束 {2} with duration: {3} minutes.'.format(file_number, total, file_path, minutes))
        print(" 第{}份文件结束!!!用时{}min".format(file_number, minutes))
if __name__ == '__main__':
    # Run the batch over the configured folder and report the total wall-clock
    # time in minutes (whole seconds first, then rounded to two decimals).
    began_at = time.time()
    get_excel(r'F:\财富趋势')
    elapsed_whole_seconds = int(time.time() - began_at)
    total_minutes = round(elapsed_whole_seconds / 60, 2)
    print("总计用时:{}min".format(total_minutes))
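# NOTE: the service URL was once written incorrectly (it was missing a "/").
# Recording it here because tracking that down cost a lot of time.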