File Sharding Pressure Test
由于我们所有的 dataset index 都放在一个目录下, 最终可能会生成非常多文件. 于是我们想知道在单个文件夹内文件数量很多的情况下, 是否会影响到我们的性能.
根据下面的测试结果, 结论是在 10w 个文件的情况下, 对于已知文件路径的寻址速度几乎没有什么影响.
1# -*- coding: utf-8 -*-
2
3"""
4Environment: MacOS M1 Pro 32G RAM
5
6Result:
7
8- Reset dir_temp: from 2023-09-25 02:48:06.710175 to 2023-09-25 02:48:16.270893 elapsed 9.560718 second.
9- cpu count: 10
10- first: 736d81ffd6f549f3b9e78812c6d5b7f6.txt
11- Create many files: from 2023-09-25 02:48:16.271288 to 2023-09-25 02:48:22.483035 elapsed 6.211747 second.
12- last: 5b12dd8262784d8eab6001f4c069f708.txt
13- Count files: from 2023-09-25 02:48:22.484286 to 2023-09-25 02:48:22.967109 elapsed 0.482823 second.
14- n files: 100002
15- read first file: from 2023-09-25 02:48:22.967166 to 2023-09-25 02:48:22.982786 elapsed 0.015620 second.
16- read last file: from 2023-09-25 02:48:22.982826 to 2023-09-25 02:48:22.998569 elapsed 0.015743 second.
17"""
18
19import os
20import shutil
21import uuid
22from pathlib import Path
23from fixa.timer import DateTimeTimer
24from mpire import WorkerPool
25
# Sandbox directory for the pressure test (~/tmp); wiped and repopulated by this script.
dir_temp = Path.home().joinpath("tmp")
27
28
def reset_dir_temp():
    """
    Delete everything under the temp directory and recreate it empty.

    Simulates the cost of clearing a folder stuffed with a huge number of
    tiny files.
    """
    target = dir_temp
    with DateTimeTimer("Reset dir_temp"):
        shutil.rmtree(target, ignore_errors=True)
        target.mkdir(parents=True, exist_ok=True)
36
37
def _create_many_files(n: int):
    """Worker task: write ``n`` small uuid-named ``.txt`` files into ``dir_temp``."""
    for _ in range(n):
        target = dir_temp.joinpath(f"{uuid.uuid4().hex}.txt")
        target.write_text("hello world")
42
43
def create_many_files():
    """
    Populate ``dir_temp`` with a huge number of tiny files (10 batches x
    10,000 files, written in parallel via :class:`mpire.WorkerPool`), plus
    one sentinel file written before the bulk load and one written after.

    Observed: ~5.626452 second.

    :return: ``(first_id, last_id)`` — the uuid hex stems of the sentinel
        files written first and last; used later to probe read latency.
    """
    n_batch = 10
    n_file_per_batch = 10000
    # Each dict becomes the kwargs of one `_create_many_files(n=...)` call.
    kwargs_list = [dict(n=n_file_per_batch) for _ in range(n_batch)]

    print(f"cpu count: {os.cpu_count()}")

    # Sentinel written BEFORE the bulk load.
    first_id = uuid.uuid4().hex
    first_path = dir_temp / f"{first_id}.txt"
    first_path.write_text("hello world")
    print(f"first: {first_path.name}")

    with DateTimeTimer("Create many files"):
        with WorkerPool() as pool:
            pool.map(_create_many_files, kwargs_list)

    # Sentinel written AFTER the bulk load.
    last_id = uuid.uuid4().hex
    last_path = dir_temp / f"{last_id}.txt"
    last_path.write_text("hello world")
    print(f"last: {last_path.name}")

    return first_id, last_id
71
72
def count_files():
    """
    Count how many entries exist under ``dir_temp`` (recursive glob) and
    print the total.

    Observed: ~0.393739 second for ~100k files.
    """
    with DateTimeTimer("Count files"):
        # sum() over a generator counts without materializing the listing, and
        # — unlike the old enumerate-and-pass loop — yields 0 instead of
        # raising NameError when the directory is empty (the loop variable
        # was never bound in that case).
        n = sum(1 for _ in dir_temp.glob("**/*"))
        print(f"n files: {n}")
83
84
def test_read_first_and_last_file(first_id, last_id):
    """
    Read the first-created and last-created sentinel files 1000 times each,
    timing both, to check whether a file's position in a huge directory
    affects known-path read latency.

    Observed:
    from 2023-09-25 02:41:23.647906 to 2023-09-25 02:41:23.659767 elapsed 0.011861 second.
    from 2023-09-25 02:41:23.659789 to 2023-09-25 02:41:23.671414 elapsed 0.011625 second.
    """
    n_times = 1000
    # Same measurement for both sentinels; only the label and stem differ.
    for label, stem in [
        ("read first file", first_id),
        ("read last file", last_id),
    ]:
        with DateTimeTimer(label):
            for _ in range(n_times):
                (dir_temp / f"{stem}.txt").read_text()
102
103
if __name__ == "__main__":
    # Pressure-test pipeline: wipe the sandbox, bulk-create ~100k small
    # files, count them, then measure known-path read latency for the
    # first- and last-created sentinel files.
    reset_dir_temp()
    first_id, last_id = create_many_files()
    count_files()
    test_read_first_and_last_file(first_id, last_id)