File Sharding Pressure Test

Since all of our dataset index files are placed in a single directory, we may eventually end up with a very large number of files in it. So we want to know whether having that many files in one folder hurts our performance.

According to the test results below, the conclusion is that with 100,000 files in a single directory, looking up a file by a known path is essentially unaffected.
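
For context on what the alternative would look like: if the flat layout ever did become a bottleneck, a common mitigation is to shard files into subdirectories keyed by a prefix of the (already random) hex filename. Below is a minimal sketch; the shard_path helper, the directory name, and the two-character prefix are illustrative assumptions, not part of the test script.

    # -*- coding: utf-8 -*-

    import uuid
    from pathlib import Path


    def shard_path(root: Path, filename: str, prefix_len: int = 2) -> Path:
        # Bucket by the first prefix_len hex characters of the name, e.g.
        # "73/736d81ff....txt". Two hex chars give 16 ** 2 = 256 buckets,
        # so 100k files average out to roughly 390 files per directory.
        return root / filename[:prefix_len] / filename


    root = Path.home().joinpath("tmp-sharded")
    p = shard_path(root, f"{uuid.uuid4().hex}.txt")
    p.parent.mkdir(parents=True, exist_ok=True)
    p.write_text("hello world")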

    # -*- coding: utf-8 -*-

    """
    Environment: macOS M1 Pro 32G RAM

    Result:

    - Reset dir_temp: from 2023-09-25 02:48:06.710175 to 2023-09-25 02:48:16.270893 elapsed 9.560718 second.
    - cpu count: 10
    - first: 736d81ffd6f549f3b9e78812c6d5b7f6.txt
    - Create many files: from 2023-09-25 02:48:16.271288 to 2023-09-25 02:48:22.483035 elapsed 6.211747 second.
    - last: 5b12dd8262784d8eab6001f4c069f708.txt
    - Count files: from 2023-09-25 02:48:22.484286 to 2023-09-25 02:48:22.967109 elapsed 0.482823 second.
    - n files: 100002
    - read first file: from 2023-09-25 02:48:22.967166 to 2023-09-25 02:48:22.982786 elapsed 0.015620 second.
    - read last file: from 2023-09-25 02:48:22.982826 to 2023-09-25 02:48:22.998569 elapsed 0.015743 second.
    """

    import os
    import shutil
    import uuid
    from pathlib import Path

    from fixa.timer import DateTimeTimer
    from mpire import WorkerPool

    dir_temp = Path.home().joinpath("tmp")


    def reset_dir_temp():
        """
        Wipe the temp directory, simulating the cost of deleting a folder
        that contains a huge number of small files.
        """
        with DateTimeTimer("Reset dir_temp"):
            shutil.rmtree(dir_temp, ignore_errors=True)
            dir_temp.mkdir(parents=True, exist_ok=True)


    def _create_many_files(n: int):
        # Worker function: write n small files with random (uuid) names.
        for _ in range(n):
            p = dir_temp / f"{uuid.uuid4().hex}.txt"
            p.write_text("hello world")


    def create_many_files():
        """
        Create a very large number of small files, simulating a folder
        that contains a huge number of small files.

        Measured: 5.626452 second.
        """
        n_batch = 10
        n_file_per_batch = 10000
        # mpire unpacks each dict as keyword arguments for the worker.
        args = [dict(n=n_file_per_batch) for _ in range(n_batch)]

        print(f"cpu count: {os.cpu_count()}")

        # Write one file before the bulk creation and one after, so we
        # have known names for the first and the last file created.
        first_id = uuid.uuid4().hex
        p = dir_temp / f"{first_id}.txt"
        p.write_text("hello world")
        print(f"first: {p.name}")

        with DateTimeTimer("Create many files"):
            with WorkerPool() as pool:
                pool.map(_create_many_files, args)

        last_id = uuid.uuid4().hex
        p = dir_temp / f"{last_id}.txt"
        p.write_text("hello world")
        print(f"last: {p.name}")

        return first_id, last_id


    def count_files():
        """
        Count how many entries the directory contains.

        Measured: 0.393739 second.
        """
        with DateTimeTimer("Count files"):
            n = sum(1 for _ in dir_temp.glob("**/*"))
        print(f"n files: {n}")


    def test_read_first_and_last_file(first_id, last_id):
        """
        Given the known names of the first and the last file created in
        the directory, read each of them 1000 times and check whether the
        two lookups differ in speed.

        from 2023-09-25 02:41:23.647906 to 2023-09-25 02:41:23.659767 elapsed 0.011861 second.
        from 2023-09-25 02:41:23.659789 to 2023-09-25 02:41:23.671414 elapsed 0.011625 second.
        """
        n_times = 1000
        with DateTimeTimer("read first file"):
            for _ in range(n_times):
                p = dir_temp / f"{first_id}.txt"
                p.read_text()

        with DateTimeTimer("read last file"):
            for _ in range(n_times):
                p = dir_temp / f"{last_id}.txt"
                p.read_text()


    if __name__ == "__main__":
        reset_dir_temp()
        first_id, last_id = create_many_files()
        count_files()
        test_read_first_and_last_file(first_id, last_id)
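
DateTimeTimer comes from the third-party fixa package (and WorkerPool from mpire). If fixa isn't available, a rough stand-in for the timing context manager can be sketched with only the standard library; the name simple_timer and the exact output format below are assumptions, not fixa's API.

    # -*- coding: utf-8 -*-

    import time
    import contextlib
    from datetime import datetime


    @contextlib.contextmanager
    def simple_timer(title: str):
        # Print start/end timestamps and elapsed wall-clock time, roughly
        # matching the "from ... to ... elapsed ... second." lines above.
        start_dt = datetime.now()
        start = time.perf_counter()
        try:
            yield
        finally:
            elapsed = time.perf_counter() - start
            print(
                f"- {title}: from {start_dt} to {datetime.now()} "
                f"elapsed {elapsed:.6f} second."
            )


    with simple_timer("count to 10 million"):
        sum(range(10_000_000))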