大文件切割Demo Python

CSV切割demo:实现了按行数切割以及按文件数切割

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2021/4/2 12:17 下午
# @Author  : Xinlong Chen
# @File    : test.py

import math
import os


class CsvSplit:
    """Split a CSV file into several smaller CSV files.

    The first line of the source file is treated as the header and is
    repeated at the top of every output file.  Output files are named
    ``<source base name>_<n><source extension>`` with ``n`` starting at 1
    and are written into the requested output directory.

    The whole source file is read into memory before it is split.  Splitting
    is purely line based: a quoted CSV field that contains a line break is
    not recognised as a single record.  Files are opened with the platform's
    default text encoding.
    """

    def __mkSubFile(self, lines, dir, head, srcName, sub):
        """Write one output chunk and return the number of the next chunk.

        Args:
            lines: Data lines (with their line terminators) for this chunk.
            dir: Directory the chunk is written into; it must already exist.
            head: Header line written before the data lines.
            srcName: Name or path of the source file; only its base name
                and extension are used to build the chunk's file name.
            sub: 1-based sequence number of this chunk.

        Returns:
            ``sub + 1``.
        """
        # Use only the base name of the source: with a path such as
        # "data/in.csv" the chunk would otherwise be placed in a
        # sub-directory of `dir` that was never created.
        stem, extname = os.path.splitext(os.path.basename(srcName))
        filename = os.path.join(dir, stem + '_' + str(sub) + extname)
        print('make file: %s' % filename)
        # newline='' writes the line terminators exactly as they were read.
        with open(filename, 'w', newline='') as fout:
            fout.write(head)
            fout.writelines(lines)
        return sub + 1

    def __readFromCsv(self, filename):
        """Read the header line and all data lines of `filename`.

        Returns:
            ``(head, lines, 0)`` on success.  When `filename` is not an
            existing regular file, an error message is printed and
            ``([], [], -1)`` is returned.
        """
        if not os.path.isfile(filename):
            print("error filename")
            return [], [], -1
        # newline='' keeps the original line terminators untranslated so
        # the chunks are written back with the same line endings.
        with open(filename, 'r', newline='') as file:
            head = file.readline()
            lines = file.readlines()
        return head, lines, 0

    def splitByLineCount(self, filename, dir, count: int, lines=None, head=""):
        """Split into files that hold at most `count` data lines each.

        Args:
            filename: Source CSV file.  It is read only when `lines` is
                empty or omitted, but it always supplies the base name of
                the output files.
            dir: Output directory; created if it does not exist.
            count: Maximum number of data lines per output file.
            lines: Optional data lines that were already read from the
                source.  When empty or omitted, the source file is read.
            head: Header line that belongs to `lines`; replaced by the
                file's own header when the source file is read here.

        Raises:
            ValueError: If `count` is not positive.

        If the source file cannot be found, a message is printed and
        nothing is written.
        """
        if count <= 0:
            raise ValueError('count must be a positive integer, got %r' % (count,))

        if not lines:
            head, lines, status = self.__readFromCsv(filename)
            if status != 0:
                return

        os.makedirs(dir, exist_ok=True)

        sub = 1
        for start in range(0, len(lines), count):
            sub = self.__mkSubFile(lines[start:start + count], dir, head, filename, sub)

    def splitByFileCount(self, filename, dir, filecount=10):
        """Split into at most `filecount` files of roughly equal length.

        Each output file receives ``ceil(data_lines / filecount)`` data
        lines (the last one possibly fewer), so fewer than `filecount`
        files may be produced when the lines do not divide evenly.

        Args:
            filename: Source CSV file.
            dir: Output directory; created if it does not exist.
            filecount: Upper bound on the number of output files.

        Raises:
            ValueError: If `filecount` is not positive.

        If the source file cannot be found, a message is printed and
        nothing is written.
        """
        if filecount <= 0:
            raise ValueError('filecount must be a positive integer, got %r' % (filecount,))

        head, lines, status = self.__readFromCsv(filename)
        if status != 0:
            return
        # A header-only file gives ceil(0 / n) == 0; clamp to 1 so the
        # per-file line count stays valid (no data files are produced).
        each_file = max(1, math.ceil(len(lines) / filecount))
        self.splitByLineCount(filename, dir, each_file, lines, head)


if __name__ == '__main__':
    # Demo run against 'weibo.csv' in the current working directory:
    # first split it into (at most) 10 files under 'weibo', then into
    # files of up to 500000 data lines each under 'weibo1'.
    splitter = CsvSplit()
    splitter.splitByFileCount('weibo.csv', dir='weibo', filecount=10)
    splitter.splitByLineCount('weibo.csv', dir='weibo1', count=500000)

上一篇:03 Spark RDD编程基础


下一篇:SpringMVC + ehcache( ehcache-spring-annotations)基于注解的服务器端数据缓存