admin 管理员组

文章数量: 887016

  • 统计指定目录子文件类型和占用大小分布
# @Time    : 2021/12/16 15:11
# @Author  : wyh
# @FileName: file_size.py
# @Software: PyCharm

import os
import datetime
"""
统计指定目录子文件类型和占用大小分布
"""

# Module-level accumulators filled in by get_size_type().
type_set = set()  # every distinct file extension seen ("" when a file has none)
type_size = dict.fromkeys(range(14), 0)  # size-bucket index -> number of files

# Binary size units, expressed in bytes.
KB = 2 ** 10
MB = 2 ** 20
GB = 2 ** 30
TB = 2 ** 40

# Inclusive lower bounds of buckets 1..13, in ascending order.
# Bucket 0 holds files smaller than 1 KB; bucket 13 holds files of 100 GB or more.
_BUCKET_FLOORS = (
    KB, MB, 5 * MB, 15 * MB, 50 * MB, 100 * MB, 300 * MB, 700 * MB,
    GB, 2 * GB, 5 * GB, 10 * GB, 100 * GB,
)


def get_size_type(path):
    """Recursively tally the files below *path* by extension and size bucket.

    Every regular file increments one counter in the module-level
    ``type_size`` histogram and adds its extension to ``type_set``.
    Entries that are neither a directory nor a regular file are ignored.

    Args:
        path: Directory to scan.

    Returns:
        None; results are accumulated in ``type_size`` and ``type_set``.
    """
    for entry in os.listdir(path):
        full = os.path.join(path, entry)
        if os.path.isdir(full):
            get_size_type(full)  # descend into the sub-directory
            continue
        if not os.path.isfile(full):
            continue
        extension = os.path.splitext(full)[1]
        byte_count = os.path.getsize(full)
        # The floors are sorted, so the number of floors reached is the bucket index.
        bucket = sum(1 for floor in _BUCKET_FLOORS if byte_count >= floor)
        type_size[bucket] += 1
        type_set.add(extension)


# Timestamps bracket the scan and the report; the elapsed time is
# end_time - start_time (printing it is left disabled, as in the original).
start_time = datetime.datetime.now()
path = r"D:\PythonProject"  # root directory to analyse
get_size_type(path)

# One item per line: the number of distinct extensions, the extensions
# themselves, and the per-bucket file-count histogram.
print(len(type_set), type_set, type_size, sep="\n")
end_time = datetime.datetime.now()
# print(end_time - start_time)

在本地测试时,统计约 23GB 的文件大约需要运行 2.6 秒。

把脚本交给运维同学使用之后,发现了一个问题:他们要处理的数据达到 10TB 级别,而且文件目录层级很深,运行时会报如下错误:
FileNotFoundError: 系统找不到指定的路径
解决方案: 在绝对路径的前面加上\\?\(Windows 默认的路径长度上限为 260 个字符,目录层级过深时完整路径会超出该限制;加上这个前缀后即可使用超长路径)

# @Time    : 2021/12/16 15:11
# @Author  : wyh
# @FileName: file_size.py
# @Software: PyCharm

import os
import datetime
"""
统计指定目录子文件类型和占用大小分布
"""

# Module-level accumulators filled in by get_size_type().
size_dict = {}  # extension -> total bytes ("None" for files without an extension)
type_dict = {}  # extension -> file count ("None" for files without an extension)
type_set = set()  # every distinct extension seen ("" when a file has none)
# Size-bucket index -> number of files; see _BUCKET_FLOORS for the boundaries.
type_size = {0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0, 10: 0, 11: 0, 12: 0, 13: 0}

# Binary size units, in bytes.
KB = 1024
MB = 1048576
GB = 1073741824
TB = 1099511627776

# Inclusive lower bounds of buckets 1..13, in ascending order.
# Bucket 0 holds files smaller than 1 KB; bucket 13 holds files of 100 GB or more.
_BUCKET_FLOORS = (
    KB, MB, 5 * MB, 15 * MB, 50 * MB, 100 * MB, 300 * MB, 700 * MB,
    GB, 2 * GB, 5 * GB, 10 * GB, 100 * GB,
)

# Windows extended-length prefix (backslash, backslash, question mark, backslash).
_LONG_PATH_PREFIX = "\\\\?\\"


def _extended_path(path):
    """Return *path* in Windows extended-length form; unchanged on other systems."""
    if os.name != "nt" or path.startswith(_LONG_PATH_PREFIX):
        return path
    # The prefix turns off path normalisation, so the path must already be
    # absolute and use backslashes only; abspath() takes care of both.
    path = os.path.abspath(path)
    if path.startswith("\\\\"):
        # UNC share: \\server\share becomes \\?\UNC\server\share
        return _LONG_PATH_PREFIX + "UNC\\" + path[2:]
    return _LONG_PATH_PREFIX + path


def get_size_type(path):
    """Recursively tally the files below *path* by extension and by size.

    For every regular file this updates the module-level accumulators:
    ``type_dict`` (file count per extension), ``size_dict`` (bytes per
    extension), ``type_size`` (file count per size bucket) and ``type_set``
    (distinct extensions). Files without an extension are stored under the
    key ``"None"`` in the two dicts and as ``""`` in ``type_set``. Entries
    that are neither a directory nor a regular file are ignored.

    On Windows the extended-length prefix is applied to the root once and
    then carried through every filesystem call, so deeply nested entries are
    counted instead of being silently skipped. On other systems the path is
    used as given.

    Args:
        path: Directory to scan.

    Returns:
        None; results are accumulated in the module-level containers.

    Raises:
        OSError: If a directory cannot be listed or a file size cannot be read.
    """
    root = _extended_path(path)
    for filename in os.listdir(root):
        # Built from the prefixed root so isdir/isfile/getsize also accept long paths.
        temp_path = os.path.join(root, filename)
        if os.path.isdir(temp_path):
            get_size_type(temp_path)  # recurse; the prefix is already in place
        elif os.path.isfile(temp_path):
            type_name = os.path.splitext(filename)[1]
            file_size = os.path.getsize(temp_path)  # read once, reused below

            key = type_name or "None"
            type_dict[key] = type_dict.get(key, 0) + 1
            size_dict[key] = size_dict.get(key, 0) + file_size

            # The floors are sorted, so the number of floors reached is the bucket index.
            bucket = sum(1 for floor in _BUCKET_FLOORS if file_size >= floor)
            type_size[bucket] += 1

            type_set.add(type_name)


# Timestamps bracket the scan and the report; the elapsed time is
# end_time - start_time (printing it is left disabled, as in the original).
start_time = datetime.datetime.now()
path = r"D:\PythonProject"  # root directory to analyse
get_size_type(path)

# One item per line: the number of distinct extensions, the extensions
# themselves, and the per-bucket file-count histogram.
print(len(type_set), type_set, type_size, sep="\n")
end_time = datetime.datetime.now()
# print(end_time - start_time)

# Write the collected statistics to a text report and echo the per-extension
# summary to the console.
file_path = r"D:\Statistic.txt"
# The labels are Chinese, so the encoding is fixed to UTF-8 rather than left
# to the locale default, which may be unable to encode them.
with open(file_path, 'w', encoding='utf-8') as f:
    f.write("文件类型数" + str(len(type_set)))
    f.write('\n')
    f.write("文件大小分布" + str(type_size))
    f.write('\n')
    for each_type, file_count in type_dict.items():
        size_mb = size_dict[each_type] / (1024 * 1024)
        # writelines() adds no separators, so the three fields used to run
        # together on the line; join them with a space instead.
        f.write(" ".join(("类型" + str(each_type), "数量" + str(file_count),
                          "大小" + str(size_mb) + "MB")))
        f.write('\n')
        print("共有【 %s 】的文件【 %d 】个 ,占用硬盘【 %.2f 】MB" %
              (each_type, file_count, size_mb))

本文标签: 文件类型 大小 目录 Windows