{
 "cells": [
  {
   "cell_type": "markdown",
   "source": [
    "# 데이터 경량화\n",
    "\n",
    "## STEP 1. 전처리 함수 정의"
   ],
   "metadata": {
    "id": "_iwHejYHOTE7"
   },
   "id": "_iwHejYHOTE7"
  },
  {
   "cell_type": "code",
   "source": [
    "from google.colab import drive\n",
    "drive.mount('/content/drive')"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "frdn3YYH4Uf3",
    "executionInfo": {
     "status": "ok",
     "timestamp": 1707032504552,
     "user_tz": -540,
     "elapsed": 28613,
     "user": {
      "displayName": "Gabe",
      "userId": "09427754051673608323"
     }
    },
    "outputId": "bbc6a7d3-f633-490b-9264-f4b4843799ba"
   },
   "id": "frdn3YYH4Uf3",
   "execution_count": 1,
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Mounted at /content/drive\n"
     ]
    }
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "7707539a",
   "metadata": {
    "scrolled": true,
    "id": "7707539a",
    "executionInfo": {
     "status": "ok",
     "timestamp": 1707033133671,
     "user_tz": -540,
     "elapsed": 960,
     "user": {
      "displayName": "Gabe",
      "userId": "09427754051673608323"
     }
    }
   },
   "outputs": [],
   "source": [
    "from tqdm import tqdm  # progress bar for the extraction loop\n",
    "import pandas as pd  # tabular data handling\n",
    "import numpy as np  # numerical computing\n",
    "import os  # filesystem access\n",
    "import zipfile  # reading the zipped dataset\n",
    "\n",
    "\n",
    "def preprocess_file(csv_file, raw_path, raw_shuffle_data_path, divide_shuffles, label, processed_files_log):\n",
    "    \"\"\"Split one raw CSV into shard files, then delete the raw CSV.\n",
    "\n",
    "    Rows are assigned to shard ``(key_id // 10000) % divide_shuffles`` and\n",
    "    appended to ``train_k{k}.csv`` inside ``raw_shuffle_data_path``. Only the\n",
    "    ``drawing`` column and the new ``y`` label column are written.\n",
    "\n",
    "    Args:\n",
    "        csv_file: File name (without directory) of the CSV inside ``raw_path``.\n",
    "        raw_path: Directory containing the extracted raw CSV.\n",
    "        raw_shuffle_data_path: Directory that receives the shard files.\n",
    "        divide_shuffles: Number of shard files to spread the rows over.\n",
    "        label: Value stored in the ``y`` column of every row of this file.\n",
    "        processed_files_log: Text file to which ``csv_file`` is appended on success.\n",
    "\n",
    "    Returns:\n",
    "        None. Any exception is caught and printed so the caller's loop keeps\n",
    "        going; a failed file is not written to the log, so a later run\n",
    "        retries it.\n",
    "    \"\"\"\n",
    "    source_csv = os.path.join(raw_path, csv_file)\n",
    "\n",
    "    try:\n",
    "        df = pd.read_csv(source_csv, usecols=['drawing', 'key_id'])\n",
    "        df['y'] = label\n",
    "        # Shard index derived from key_id.\n",
    "        df['cv'] = (df.key_id // 10000) % divide_shuffles\n",
    "\n",
    "        # NOTE: if this loop fails part-way, rows already appended stay in the\n",
    "        # shard files, so a retry of the same CSV appends them again.\n",
    "        for k in range(divide_shuffles):\n",
    "            filename = os.path.join(raw_shuffle_data_path, f'train_k{k}.csv')\n",
    "            chunk = df[df.cv == k].drop(columns=['key_id', 'cv'])\n",
    "            # Always append; write the header only when the shard is first created.\n",
    "            chunk.to_csv(filename, mode='a', header=not os.path.exists(filename), index=False)\n",
    "\n",
    "        os.remove(source_csv)  # free disk space once the rows are sharded\n",
    "    except Exception as e:\n",
    "        print(f\"오류 발생: {csv_file} 처리 중 - {e}\")\n",
    "        return  # do not mark a failed file as processed\n",
    "\n",
    "    with open(processed_files_log, 'a') as log:\n",
    "        log.write(csv_file + '\\n')\n"
   ]
  },
  {
   "cell_type": "markdown",
   "source": [
    "## STEP 2. 압축 해제 및 전처리"
   ],
   "metadata": {
    "id": "uW9jU8naOYOP"
   },
   "id": "uW9jU8naOYOP"
  },
  {
   "cell_type": "code",
   "source": [
    "def main(zip_path='/content/drive/MyDrive/AIFFEL_QUESTs/quick_draw/quickdraw-doodle-recognition.zip',\n",
    "         data_root='./data/',\n",
    "         divide_shuffles=100):\n",
    "    \"\"\"Extract each ``train_raw/*.csv`` from the archive and shard it.\n",
    "\n",
    "    Files are extracted one at a time and handed to ``preprocess_file``.\n",
    "    File names already listed in ``processed_files.log`` under ``data_root``\n",
    "    are skipped, so an interrupted run can be resumed.\n",
    "\n",
    "    Args:\n",
    "        zip_path: Path of the zip archive to read.\n",
    "        data_root: Directory into which archive members are extracted and\n",
    "            under which the shard directory and the log file live.\n",
    "        divide_shuffles: Number of shard files passed on to ``preprocess_file``.\n",
    "    \"\"\"\n",
    "    # A trailing '' component keeps the final path separator.\n",
    "    raw_path = os.path.join(data_root, 'train_raw', '')\n",
    "    raw_shuffle_data_path = os.path.join(raw_path, 'shuffle_raw_gzs', '')\n",
    "    processed_files_log = os.path.join(data_root, 'processed_files.log')\n",
    "\n",
    "    os.makedirs(raw_shuffle_data_path, exist_ok=True)\n",
    "\n",
    "    # Names of files finished in an earlier run.\n",
    "    if os.path.exists(processed_files_log):\n",
    "        with open(processed_files_log, 'r') as log:\n",
    "            processed_files = set(log.read().splitlines())\n",
    "    else:\n",
    "        processed_files = set()\n",
    "\n",
    "    with zipfile.ZipFile(zip_path, 'r') as zip_ref:\n",
    "        file_list = [\n",
    "            name for name in zip_ref.namelist()\n",
    "            if name.startswith('train_raw/') and name.endswith('.csv')\n",
    "        ]\n",
    "\n",
    "        # The context manager closes the bar even if the loop raises.\n",
    "        with tqdm(total=len(file_list)) as pbar:\n",
    "            # The label is the position in the full archive listing, so it\n",
    "            # does not shift when already-processed files are skipped.\n",
    "            for y, file in enumerate(file_list):\n",
    "                csv_file = file.split('/')[-1]\n",
    "                pbar.set_description(f\"Processing {csv_file}\")\n",
    "\n",
    "                if csv_file not in processed_files:\n",
    "                    zip_ref.extract(file, data_root)\n",
    "                    preprocess_file(csv_file, raw_path, raw_shuffle_data_path, divide_shuffles, y, processed_files_log)\n",
    "                pbar.update(1)\n",
    "\n",
    "\n",
    "if __name__ == \"__main__\":\n",
    "    main()"
   ],
   "metadata": {
    "id": "f0PDiqyuOYX-",
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "outputId": "0533624c-68de-4e85-e5fd-03e402b9a6e8"
   },
   "id": "f0PDiqyuOYX-",
   "execution_count": null,
   "outputs": [
    {
     "output_type": "stream",
     "name": "stderr",
     "text": [
      "Processing cannon.csv: 17%|█▋ | 58/340 [21:55<1:58:40, 25.25s/it]"
     ]
    }
   ]
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "s6-w3WST4sqi"
   },
   "id": "s6-w3WST4sqi",
   "execution_count": null,
   "outputs": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  },
  "colab": {
   "provenance": [],
   "gpuType": "V100",
   "machine_shape": "hm"
  },
  "accelerator": "GPU"
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
0 commit comments