AWS Notes > Performance Comparison of Uploading Files to S3 †
Overview †
Verify how much upload performance changes depending on which boto3 API is used to put files into S3. The basic test approach is as follows.
- Each test script uploads 10 JSON files (test0001.json .. test0010.json) whose payload grows with the file number, recording the elapsed time per file plus the overall total.
- Each script is run 10 times, appending one result row per run to its own result file (result1.csv .. result8.csv).
- print_average.py averages each column after discarding the minimum and maximum values.
- The whole set of runs is repeated 5 times (summaryA-1.csv .. summaryA-5.csv).

Results (average) †
Looking at the results, there was no dramatic difference between the APIs, but compressing the data before upload does appear to be faster.
Some time later, I added a gzip-compressed variant that uses client.upload_fileobj (s3_put_test8.py below) and measured it as well.
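Since the compressed variants store raw gzip bytes under the same keys as the uncompressed tests, the reader side has to decompress after download. A minimal sketch of that read path, assuming the bucket and key layout used by the test scripts (get_json is a hypothetical helper, not part of the measured code):

import boto3
import gzip
import json

s3_client = boto3.client('s3')

def get_json(bucket_name, file_key):
    # Download the object, decompress the gzip payload, then parse the JSON.
    body = s3_client.get_object(Bucket=bucket_name, Key=file_key)['Body'].read()
    return json.loads(gzip.decompress(body))

# Example (hypothetical key):
# data = get_json('test-s3-put-and-get', 'dir1/dir2/dir3/test0001.json')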
Test scripts †

Test skeleton †
s3_put_test_base.py

import boto3
from datetime import datetime
import decimal
import json
import random
import time
import sys
import hashlib

class S3PutTestBase:
    """Uploads FILE_PUT_COUNT JSON files of increasing size and records the
    per-file and total elapsed times. Subclasses implement upload()."""

    BUCKET_NAME = 'test-s3-put-and-get'
    FILE_DIR_PATH = 'dir1/dir2/dir3'
    FILE_PUT_COUNT = 10
    RESULT_FILE = sys.argv[1]
    TIMES = sys.argv[2]
    time_info = []

    @classmethod
    def main(cls):
        cls.print_title()
        stime = time.time()
        for i in range(cls.FILE_PUT_COUNT):
            file_name = f'test{i+1:04d}.json'
            file_key = f'{cls.FILE_DIR_PATH}/{file_name}'
            file_data = cls.get_file_data(i + 1)
            cls.upload(cls.BUCKET_NAME, cls.add_prefix(file_key), file_data)
        etime = time.time()
        # The last column is the total time for all uploads.
        cls.time_info.append(etime - stime)
        with open(cls.RESULT_FILE, 'a') as f:
            line = f'{cls.TIMES}回目\t' + '\t'.join(list(map(lambda x: str(round(x, 3)), cls.time_info)))
            f.write(f'{line}\n')
        print(f'{line}')

    @classmethod
    def add_prefix(cls, file_key):
        # Optionally prepend an MD5-based prefix to the key (currently disabled).
        return file_key
        #file_prefix = hashlib.md5(bytes(file_key, 'utf-8')).hexdigest()[0:4]
        #return f'{file_prefix}-{file_key}'

    @classmethod
    def print_title(cls):
        res = ''
        try:
            with open(cls.RESULT_FILE) as f:
                res = f.read()
        except:
            # The result file may not exist yet on the first run.
            pass
        if not res:
            with open(cls.RESULT_FILE, 'a') as f:
                line = 'N回目\t' + '\t'.join([f'ファイル{i+1}' for i in range(cls.FILE_PUT_COUNT)]) + '\t合計'
                f.write(f'{line}\n')
        if sys.argv[2] == '1':
            line = 'N回目\t' + '\t'.join([f'ファイル{i+1}' for i in range(cls.FILE_PUT_COUNT)]) + '\t合計'
            print(line)

    @classmethod
    def get_file_data(cls, file_no):
        # Build a JSON payload whose size grows with the file number.
        data = {
            'var1': 'abcdefg',
            'var2': 'xyz1234',
            'data': [{f'key{i+1:08d}': random.randrange(1, 100)} for i in range(file_no * 100)]
        }
        data_bytes = bytes(json.dumps(data, default=decimal.Decimal), 'utf-8')
        return data_bytes

Using boto3.resource.Object.put †
s3_put_test1.py

import boto3
from datetime import datetime
import decimal
import json
import random
import time
import sys
from s3_put_test_base import S3PutTestBase

class S3PutTest1(S3PutTestBase):
    """ Uses boto3.resource.Object.put """
    s3 = boto3.resource('s3')

    @classmethod
    def upload(cls, bucket_name, file_key, file_bytes):
        stime = time.time()
        s3_obj = cls.s3.Object(bucket_name, file_key)
        result = s3_obj.put(Body=file_bytes)
        etime = time.time()
        cls.time_info.append(etime - stime)
        return result

if __name__ == '__main__':
    S3PutTest1.main()

Using boto3.resource.Bucket.put_object †
s3_put_test2.py

import boto3
from datetime import datetime
import decimal
import json
import random
import time
from s3_put_test_base import S3PutTestBase

class S3PutTest2(S3PutTestBase):
    """ Uses boto3.resource.Bucket.put_object """
    s3_bucket = boto3.resource('s3').Bucket(S3PutTestBase.BUCKET_NAME)

    @classmethod
    def upload(cls, bucket_name, file_key, file_bytes):
        stime = time.time()
        s3_object = cls.s3_bucket.put_object(
            Key=file_key,
            Body=file_bytes
        )
        etime = time.time()
        cls.time_info.append(etime - stime)
        return s3_object

if __name__ == '__main__':
    S3PutTest2.main()

Using boto3.client.put_object †
s3_put_test3.py

import boto3
from datetime import datetime
import decimal
import json
import random
import time
import sys
from s3_put_test_base import S3PutTestBase

class S3PutTest3(S3PutTestBase):
    """ Uses boto3.client.put_object """
    s3_client = boto3.client('s3')

    @classmethod
    def upload(cls, bucket_name, file_key, file_bytes):
        stime = time.time()
        result = cls.s3_client.put_object(
            Bucket=bucket_name,
            Key=file_key,
            Body=file_bytes
        )
        etime = time.time()
        cls.time_info.append(etime - stime)
        return result

if __name__ == '__main__':
    S3PutTest3.main()

Using boto3.resource.meta.client.upload_file †
s3_put_test4.py

import boto3
from datetime import datetime
import decimal
import json
import random
import time
import sys
import uuid
import os
from s3_put_test_base import S3PutTestBase

class S3PutTest4(S3PutTestBase):
    """ Uses client.upload_file from boto3.resource.meta """
    s3_meta_client = boto3.resource('s3').meta.client

    @classmethod
    def upload(cls, bucket_name, file_key, file_bytes):
        stime = time.time()
        # upload_file takes a file path, so stage the data in a temporary file first.
        tmp_file_name = '/tmp/' + str(uuid.uuid1()) + '.json'
        with open(tmp_file_name, 'wb') as f:
            f.write(file_bytes)
        result = cls.s3_meta_client.upload_file(tmp_file_name, bucket_name, file_key)
        os.remove(tmp_file_name)
        etime = time.time()
        cls.time_info.append(etime - stime)
        return result

if __name__ == '__main__':
    S3PutTest4.main()

Using boto3.resource.Bucket.put_object (with gzip compression) †
s3_put_test5.py

import boto3
from datetime import datetime
import decimal
import gzip
import json
import random
import time
from s3_put_test_base import S3PutTestBase

class S3PutTest5(S3PutTestBase):
    """ Uses s3_bucket.put_object (with gzip compression) """
    s3_bucket = boto3.resource('s3').Bucket(S3PutTestBase.BUCKET_NAME)

    @classmethod
    def upload(cls, bucket_name, file_key, file_bytes):
        stime = time.time()
        file_bytes = gzip.compress(file_bytes)
        s3_object = cls.s3_bucket.put_object(
            Key=file_key,
            Body=file_bytes
        )
        etime = time.time()
        cls.time_info.append(etime - stime)
        return s3_object

if __name__ == '__main__':
    S3PutTest5.main()

Using boto3.client.put_object with object attributes specified †
s3_put_test6.py

import boto3
from datetime import datetime
import decimal
import json
import random
import time
import sys
from s3_put_test_base import S3PutTestBase

class S3PutTest6(S3PutTestBase):
    """ Uses boto3.client.put_object with object attributes specified """
    s3_client = boto3.client('s3')

    @classmethod
    def upload(cls, bucket_name, file_key, file_bytes):
        stime = time.time()
        result = cls.s3_client.put_object(
            ACL='private',
            Bucket=bucket_name,
            Key=file_key,
            Body=file_bytes,
            ContentType='application/json',
            ContentLength=len(file_bytes)
        )
        etime = time.time()
        cls.time_info.append(etime - stime)
        return result

if __name__ == '__main__':
    S3PutTest6.main()

Using boto3.resource.Bucket.put_object with gzip compression and attributes specified †
s3_put_test7.py

import boto3
from datetime import datetime
import decimal
import json
import random
import time
import gzip
from s3_put_test_base import S3PutTestBase

class S3PutTest7(S3PutTestBase):
    """ Uses boto3.resource.Bucket.put_object with gzip compression and attributes specified """
    s3_bucket = boto3.resource('s3').Bucket(S3PutTestBase.BUCKET_NAME)

    @classmethod
    def upload(cls, bucket_name, file_key, file_bytes):
        stime = time.time()
        file_bytes = gzip.compress(file_bytes)
        s3_object = cls.s3_bucket.put_object(
            ACL='private',
            Key=file_key,
            Body=file_bytes,
            ContentType='application/gzip',
            ContentLength=len(file_bytes)
        )
        etime = time.time()
        cls.time_info.append(etime - stime)
        return s3_object

if __name__ == '__main__':
    S3PutTest7.main()

Using boto3.resource.meta.client.upload_fileobj †
s3_put_test8.py

import boto3
from datetime import datetime
import decimal
import json
import random
import time
import sys
import uuid
import gzip
import os
from s3_put_test_base import S3PutTestBase

class S3PutTest8(S3PutTestBase):
    """ Uses boto3.resource.meta.client.upload_fileobj """
    s3_meta_client = boto3.resource('s3').meta.client

    @classmethod
    def upload(cls, bucket_name, file_key, file_bytes):
        stime = time.time()
        file_bytes = gzip.compress(file_bytes)
        # Stage the compressed data in a temporary file, then upload it as a file object.
        tmp_file_name = '/tmp/' + str(uuid.uuid1()) + '.gz'
        with open(tmp_file_name, 'wb') as f:
            f.write(file_bytes)
        with open(tmp_file_name, 'rb') as f:
            result = cls.s3_meta_client.upload_fileobj(f, bucket_name, file_key)
        os.remove(tmp_file_name)
        etime = time.time()
        cls.time_info.append(etime - stime)
        return result

if __name__ == '__main__':
    S3PutTest8.main()
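s3_put_test8.py stages the gzip data in a temporary file under /tmp before calling upload_fileobj. Since upload_fileobj only needs a file-like object, the temporary file could be avoided with io.BytesIO. A minimal sketch of that alternative (not one of the measured variants; upload_compressed is a hypothetical helper):

import boto3
import gzip
import io

s3_meta_client = boto3.resource('s3').meta.client

def upload_compressed(bucket_name, file_key, file_bytes):
    # Wrap the compressed bytes in an in-memory file object instead of writing to /tmp.
    buffer = io.BytesIO(gzip.compress(file_bytes))
    s3_meta_client.upload_fileobj(buffer, bucket_name, file_key)

This skips the disk round trip entirely, which may matter more than the choice of upload API for small objects.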
Result aggregation script †
print_average.py

import sys

def main(result_file):
    num_min = []
    num_max = []
    num_info = []
    line_count = 0
    with open(result_file) as f:
        result = f.read()
        # print(result)
    lines = result.split('\n')
    for i, line in enumerate(lines):
        if i == 0:
            continue  # skip the header line
        cols = line.split('\t')
        line_count += 1
        for j, col in enumerate(cols):
            if not col or j == 0:
                continue  # skip empty cells and the run-number column
            if i == 1:
                num_info.append([])
                num_min.append(999.0)
                num_max.append(0.0)
            num = float(col)
            num_info[j-1].append(num)
            if num_max[j-1] < num:
                num_max[j-1] = num
            if num_min[j-1] > num:
                num_min[j-1] = num
    averages = []
    for i, nums in enumerate(num_info):
        min_val = num_min[i]
        max_val = num_max[i]
        summary = 0.0
        col_count = 0
        for j, num in enumerate(nums):
            # Exclude the minimum and maximum values from the average.
            if min_val == num or max_val == num:
                continue
            col_count += 1
            summary = summary + num
            #print(f'{i}/{j} : {min_val} : {max_val} : {num}')
        average = summary / col_count
        averages.append(average)
    print(f'平均({result_file})\t' + '\t'.join(list(map(lambda x: str(round(x, 3)), averages))))

if __name__ == '__main__':
    result_file = sys.argv[1]
    main(result_file)

Run script †
s3-put-test.sh

#!/bin/bash
for i in `seq 1 8`
do
  file_name="s3_put_test${i}.py"
  result_file="result${i}.csv"
  rm -rf $result_file
  for j in `seq 1 10`
  do
    python3 $file_name $result_file $j
  done
  python3 print_average.py $result_file
done

A shell script that repeats the above 5 times:

#!/bin/bash
for no in `seq 1 5`
do
  ./s3-put-test.sh | tee summaryA-${no}.csv
done

Results †
cat summaryA* | grep 平均 | sort
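Note that the 平均 rows produced by print_average.py are trimmed means: for each column, every value equal to that column's minimum or maximum is discarded before averaging. A quick cross-check with hypothetical values:

nums = [0.12, 0.31, 0.15, 0.14, 0.13]
trimmed = [n for n in nums if n not in (min(nums), max(nums))]
print(round(sum(trimmed) / len(trimmed), 3))  # -> 0.14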