-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #46 from thetjswo/main
Data: 파이프라인 구축 중간 커밋
- Loading branch information
Showing
5 changed files
with
330 additions
and
1 deletion.
There are no files selected for viewing
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,291 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"id": "bc3ebba9", | ||
"metadata": {}, | ||
"source": [ | ||
"# 서로 다른 모델과 분할된 데이터, 그 결과에 대한 병합\n", | ||
"---\n", | ||
"\n", | ||
"## 개요\n", | ||
"승순님이 만들어주신 100개의 chunk 파일을 적절하게 사용하여 최소 3개의 모델을 학습시킨다. 각 모델의 예측 결과를 병합하는 앙상블 기법을 사용하여 최종 결과를 산출한다.\n", | ||
"\n", | ||
"## 목차\n", | ||
"* 데이터 압축해제 및 확인하기\n", | ||
"* 파이프라인 구축하기\n", | ||
"* 모델 구조 설계하기\n", | ||
"* 각각의 모델 학습하기\n", | ||
"* 학습 결과 병합하기" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "3b08b963", | ||
"metadata": {}, | ||
"source": [ | ||
"### STEP 1. 데이터 압축해제 및 확인하기\n", | ||
"승순님이 만들어 놓은 청크 파일을 압축해제하고, 파일이 어떻게 구성되어 있는지 확인한다." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 1, | ||
"id": "28027d21", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import os\n", | ||
"import zipfile\n", | ||
"\n", | ||
"# Resolve the archive and staging locations under the user's home directory.\n", | ||
"# os.path.expanduser('~') is used instead of os.getenv('HOME') + '...', which\n", | ||
"# raises TypeError when the HOME environment variable is not set.\n", | ||
"data_root = os.path.join(os.path.expanduser('~'), 'aiffel', 'Kaggle_Hackathon', 'data')\n", | ||
"compressed_file_path = os.path.join(data_root, 'simple_shuffle_data.zip')\n", | ||
"gz_file_path = os.path.join(data_root, 'compressed_simple')" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 2, | ||
"id": "fa370735", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"ename": "KeyboardInterrupt", | ||
"evalue": "", | ||
"output_type": "error", | ||
"traceback": [ | ||
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", | ||
"\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", | ||
"\u001b[0;32m/tmp/ipykernel_253/1561374210.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 6\u001b[0;31m \u001b[0mextract_zip\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcompressed_file_path\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mgz_file_path\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 7\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'All Done!'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | ||
"\u001b[0;32m/tmp/ipykernel_253/1561374210.py\u001b[0m in \u001b[0;36mextract_zip\u001b[0;34m(current_file_path, extract_file_path)\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mextract_zip\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcurrent_file_path\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mextract_file_path\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mzipfile\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mZipFile\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcurrent_file_path\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'r'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mzip_ref\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0mzip_ref\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mextractall\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mextract_file_path\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", | ||
"\u001b[0;32m/opt/conda/lib/python3.9/zipfile.py\u001b[0m in \u001b[0;36mextractall\u001b[0;34m(self, path, members, pwd)\u001b[0m\n\u001b[1;32m 1631\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1632\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mzipinfo\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mmembers\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1633\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_extract_member\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mzipinfo\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpath\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpwd\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1634\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1635\u001b[0m \u001b[0;34m@\u001b[0m\u001b[0mclassmethod\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | ||
"\u001b[0;32m/opt/conda/lib/python3.9/zipfile.py\u001b[0m in \u001b[0;36m_extract_member\u001b[0;34m(self, member, targetpath, pwd)\u001b[0m\n\u001b[1;32m 1686\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmember\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpwd\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mpwd\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0msource\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;31m \u001b[0m\u001b[0;31m\\\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1687\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtargetpath\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"wb\"\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mtarget\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1688\u001b[0;31m \u001b[0mshutil\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcopyfileobj\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msource\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtarget\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1689\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1690\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mtargetpath\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | ||
"\u001b[0;32m/opt/conda/lib/python3.9/shutil.py\u001b[0m in \u001b[0;36mcopyfileobj\u001b[0;34m(fsrc, fdst, length)\u001b[0m\n\u001b[1;32m 203\u001b[0m \u001b[0mfdst_write\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfdst\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwrite\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 204\u001b[0m \u001b[0;32mwhile\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 205\u001b[0;31m \u001b[0mbuf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfsrc_read\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlength\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 206\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mbuf\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 207\u001b[0m \u001b[0;32mbreak\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | ||
"\u001b[0;32m/opt/conda/lib/python3.9/zipfile.py\u001b[0m in \u001b[0;36mread\u001b[0;34m(self, n)\u001b[0m\n\u001b[1;32m 920\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_offset\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 921\u001b[0m \u001b[0;32mwhile\u001b[0m \u001b[0mn\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m0\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_eof\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 922\u001b[0;31m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_read1\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 923\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mn\u001b[0m \u001b[0;34m<\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 924\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_readbuffer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | ||
"\u001b[0;32m/opt/conda/lib/python3.9/zipfile.py\u001b[0m in \u001b[0;36m_read1\u001b[0;34m(self, n)\u001b[0m\n\u001b[1;32m 988\u001b[0m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_decompressor\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0munconsumed_tail\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 989\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mn\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 990\u001b[0;31m \u001b[0mdata\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_read2\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mn\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 991\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 992\u001b[0m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_read2\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | ||
"\u001b[0;32m/opt/conda/lib/python3.9/zipfile.py\u001b[0m in \u001b[0;36m_read2\u001b[0;34m(self, n)\u001b[0m\n\u001b[1;32m 1020\u001b[0m \u001b[0mn\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mn\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_compress_left\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1021\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1022\u001b[0;31m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_fileobj\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1023\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_compress_left\u001b[0m \u001b[0;34m-=\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1024\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | ||
"\u001b[0;32m/opt/conda/lib/python3.9/zipfile.py\u001b[0m in \u001b[0;36mread\u001b[0;34m(self, n)\u001b[0m\n\u001b[1;32m 740\u001b[0m \"Close the writing handle before trying to read.\")\n\u001b[1;32m 741\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_file\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mseek\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_pos\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 742\u001b[0;31m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_file\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 743\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_pos\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_file\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtell\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 744\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | ||
"\u001b[0;31mKeyboardInterrupt\u001b[0m: " | ||
] | ||
} | ||
], | ||
"source": [ | ||
"def extract_zip(current_file_path, extract_file_path):\n", | ||
"    \"\"\"Unpack every member of a zip archive into a directory.\n", | ||
"\n", | ||
"    Args:\n", | ||
"        current_file_path: Path of the .zip archive to read.\n", | ||
"        extract_file_path: Directory that receives the extracted members.\n", | ||
"    \"\"\"\n", | ||
"    with zipfile.ZipFile(current_file_path, mode='r') as archive:\n", | ||
"        archive.extractall(path=extract_file_path)\n", | ||
" \n", | ||
" \n", | ||
"# Unpack the downloaded archive into the staging directory, then report completion.\n", | ||
"extract_zip(current_file_path=compressed_file_path, extract_file_path=gz_file_path)\n", | ||
"print('All Done!')" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "ed69ff0a", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import gzip\n", | ||
"import shutil\n", | ||
"\n", | ||
"def extract_gz(extract_file_path, source_dir=None, num_chunks=100):\n", | ||
"    \"\"\"Decompress the train_k{i}.gz chunk files into a directory.\n", | ||
"\n", | ||
"    Args:\n", | ||
"        extract_file_path: Directory that receives the decompressed files;\n", | ||
"            it is created when it does not exist yet.\n", | ||
"        source_dir: Directory holding the .gz chunks. Defaults to the\n", | ||
"            notebook-level gz_file_path.\n", | ||
"        num_chunks: Number of chunks to process, indexed 0..num_chunks-1.\n", | ||
"\n", | ||
"    Returns:\n", | ||
"        List of the decompressed file paths, in chunk order.\n", | ||
"    \"\"\"\n", | ||
"    if source_dir is None:\n", | ||
"        source_dir = gz_file_path\n", | ||
"    # open(..., 'wb') below fails if the target directory is missing.\n", | ||
"    os.makedirs(extract_file_path, exist_ok=True)\n", | ||
"\n", | ||
"    extracted_paths = []\n", | ||
"    for i in range(num_chunks):\n", | ||
"        compressed_file_path = os.path.join(source_dir, f'train_k{i}.gz')\n", | ||
"        # The output keeps the chunk's base name with the '.gz' suffix removed.\n", | ||
"        file_name = os.path.basename(compressed_file_path)\n", | ||
"        extract_path = os.path.join(extract_file_path, os.path.splitext(file_name)[0])\n", | ||
"\n", | ||
"        with gzip.open(compressed_file_path, 'rb') as f_in, open(extract_path, 'wb') as f_out:\n", | ||
"            shutil.copyfileobj(f_in, f_out)\n", | ||
"        extracted_paths.append(extract_path)\n", | ||
"\n", | ||
"    return extracted_paths\n", | ||
" \n", | ||
"# Destination for the decompressed chunks. os.path.expanduser('~') replaces\n", | ||
"# os.getenv('HOME') + '...', which raises TypeError when HOME is not set.\n", | ||
"extract_to_path = os.path.join(os.path.expanduser('~'), 'aiffel', 'Kaggle_Hackathon', 'data', 'simple')\n", | ||
"\n", | ||
"data = extract_gz(extract_to_path)\n", | ||
"\n", | ||
"print('All Done!')" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "a24edcc2", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import pandas as pd\n", | ||
"import os\n", | ||
"\n", | ||
"# Preview the first decompressed chunk to check its columns.\n", | ||
"# os.path.expanduser('~') replaces os.getenv('HOME') + '...', which raises TypeError when HOME is not set.\n", | ||
"file_path = os.path.join(os.path.expanduser('~'), 'aiffel', 'Kaggle_Hackathon', 'data', 'simple', 'train_k0')\n", | ||
"data_df = pd.read_csv(file_path)\n", | ||
"# A bare last expression renders the frame with the notebook's rich display instead of plain text.\n", | ||
"data_df.head()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "b28424d0", | ||
"metadata": {}, | ||
"source": [ | ||
"### STEP 2. 파이프라인 구축하기\n", | ||
"압축해제된 데이터들 중 학습에 사용할 데이터를 핸들링하여 모델에 바로 입력할 수 있도록 준비한다." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 5, | ||
"id": "94749ccb", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"ename": "SyntaxError", | ||
"evalue": "invalid syntax (2595014484.py, line 50)", | ||
"output_type": "error", | ||
"traceback": [ | ||
"\u001b[0;36m File \u001b[0;32m\"/tmp/ipykernel_253/2595014484.py\"\u001b[0;36m, line \u001b[0;32m50\u001b[0m\n\u001b[0;31m root_path = os.getenv('HOME') + '/aiffel/Kaggle_Hackathon/data'\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mSyntaxError\u001b[0m\u001b[0;31m:\u001b[0m invalid syntax\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"import os\n", | ||
"import pandas as pd\n", | ||
"import ast\n", | ||
"from PIL import Image, ImageDraw\n", | ||
"import matplotlib.pyplot as plt\n", | ||
"import tensorflow as tf\n", | ||
"from tensorflow.keras.preprocessing.image import img_to_array\n", | ||
"from tensorflow.keras.applications.inception_v3 import preprocess_input\n", | ||
"import pickle\n", | ||
"\n", | ||
"def data_pipeline(dir_path, pkl_path, file_idx):\n", | ||
"    \"\"\"Render one chunk's stroke drawings into image tensors paired with labels.\n", | ||
"\n", | ||
"    Args:\n", | ||
"        dir_path: Directory containing the decompressed train_k{file_idx} CSV files.\n", | ||
"        pkl_path: Path of the pickle file holding the label-name lookup.\n", | ||
"        file_idx: Index of the chunk file to load.\n", | ||
"\n", | ||
"    Returns:\n", | ||
"        DataFrame with one row per drawing: 'img' (float32 tensor) and 'label'.\n", | ||
"    \"\"\"\n", | ||
"    data = pd.read_csv(os.path.join(dir_path, f'train_k{file_idx}'))\n", | ||
"\n", | ||
"    # NOTE(review): pickle.load can execute arbitrary code; only load a trusted file here.\n", | ||
"    with open(pkl_path, 'rb') as f:\n", | ||
"        label_names = pickle.load(f)\n", | ||
"\n", | ||
"    # Collect one record per drawing and build the DataFrame once, so that each\n", | ||
"    # image stays on the same row as its label (DataFrame.append is removed in pandas 2.0).\n", | ||
"    records = []\n", | ||
"    for drawing_str, label_idx in zip(data['drawing'], data['y']):\n", | ||
"        img = Image.new('L', (256, 256), color='white')\n", | ||
"        draw = ImageDraw.Draw(img)\n", | ||
"\n", | ||
"        # stroke[0] / stroke[1] are used as the x / y coordinate lists; join consecutive points.\n", | ||
"        for stroke in ast.literal_eval(drawing_str):\n", | ||
"            for i in range(len(stroke[0]) - 1):\n", | ||
"                draw.line([stroke[0][i], stroke[1][i], stroke[0][i+1], stroke[1][i+1]], fill='black', width=3)\n", | ||
"\n", | ||
"        img = img.resize((28, 28))\n", | ||
"\n", | ||
"        image_array = img_to_array(img)\n", | ||
"        image_array = preprocess_input(image_array)\n", | ||
"\n", | ||
"        records.append({\n", | ||
"            'img': tf.convert_to_tensor(image_array, dtype=tf.float32),\n", | ||
"            'label': label_names[label_idx],\n", | ||
"        })\n", | ||
"\n", | ||
"    data_df = pd.DataFrame(records, columns=['img', 'label'])\n", | ||
"\n", | ||
"    print(data_df['img'].head())\n", | ||
"    print(data['y'].head())\n", | ||
"\n", | ||
"    return data_df\n", | ||
"\n", | ||
"\n", | ||
"# Locate the chunk directory and the label lookup under the user's home directory.\n", | ||
"# os.path.expanduser('~') replaces os.getenv('HOME') + '...', which raises TypeError when HOME is not set.\n", | ||
"root_path = os.path.join(os.path.expanduser('~'), 'aiffel', 'Kaggle_Hackathon', 'data')\n", | ||
"dir_path = os.path.join(root_path, 'simple')\n", | ||
"pkl_path = os.path.join(root_path, 'label_names.pkl')\n", | ||
"test = data_pipeline(dir_path, pkl_path, 50)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "eebf25fa", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import os\n", | ||
"import pandas as pd\n", | ||
"import ast\n", | ||
"from PIL import Image, ImageDraw\n", | ||
"import matplotlib.pyplot as plt\n", | ||
"import tensorflow as tf\n", | ||
"from tensorflow.keras.preprocessing.image import img_to_array\n", | ||
"from tensorflow.keras.applications.inception_v3 import preprocess_input\n", | ||
"import pickle\n", | ||
"\n", | ||
"def data_pipeline(dir_path, pkl_path, file_idx):\n", | ||
"    \"\"\"Build a DataFrame of rendered drawings and label names for one chunk.\n", | ||
"\n", | ||
"    Args:\n", | ||
"        dir_path: Directory containing the train_k{file_idx} CSV files.\n", | ||
"        pkl_path: Path of the pickle file holding the label-name lookup.\n", | ||
"        file_idx: Index of the chunk file to load.\n", | ||
"\n", | ||
"    Returns:\n", | ||
"        DataFrame with an 'img' column (float32 tensors) and a 'label' column.\n", | ||
"    \"\"\"\n", | ||
"    # Read the chunk and parse every serialized drawing before rendering starts.\n", | ||
"    chunk = pd.read_csv(os.path.join(dir_path, f'train_k{file_idx}'))\n", | ||
"    parsed_drawings = [ast.literal_eval(raw) for raw in chunk['drawing']]\n", | ||
"\n", | ||
"    with open(pkl_path, 'rb') as handle:\n", | ||
"        label_names = pickle.load(handle)\n", | ||
"\n", | ||
"    # Accumulators for the rendered images and their label names.\n", | ||
"    images, labels = [], []\n", | ||
"    for strokes, label_idx in zip(parsed_drawings, chunk['y']):\n", | ||
"        canvas = Image.new('L', (256, 256), color='white')\n", | ||
"        pen = ImageDraw.Draw(canvas)\n", | ||
"\n", | ||
"        # Connect each pair of consecutive points of a stroke with a line segment.\n", | ||
"        for stroke in strokes:\n", | ||
"            xs = stroke[0]\n", | ||
"            for end in range(1, len(xs)):\n", | ||
"                ys = stroke[1]\n", | ||
"                start = end - 1\n", | ||
"                pen.line([xs[start], ys[start], xs[end], ys[end]], fill='black', width=3)\n", | ||
"\n", | ||
"        thumbnail = canvas.resize((28, 28))\n", | ||
"        pixels = preprocess_input(img_to_array(thumbnail))\n", | ||
"\n", | ||
"        images.append(tf.convert_to_tensor(pixels, dtype=tf.float32))\n", | ||
"        labels.append(label_names[label_idx])\n", | ||
"\n", | ||
"    # Assemble the result frame in a single step.\n", | ||
"    return pd.DataFrame({'img': images, 'label': labels})\n", | ||
"\n", | ||
"\n", | ||
"# Resolve the data root from the home directory; os.path.expanduser('~') still works\n", | ||
"# when HOME is unset, whereas os.getenv('HOME') + '...' would raise TypeError.\n", | ||
"root_path = os.path.join(os.path.expanduser('~'), 'aiffel', 'Kaggle_Hackathon', 'data')\n", | ||
"dir_path = os.path.join(root_path, 'simple')\n", | ||
"pkl_path = os.path.join(root_path, 'label_names.pkl')\n", | ||
"test = data_pipeline(dir_path, pkl_path, 50)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "d604126a", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3 (ipykernel)", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.9.7" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 5 | ||
} |
Oops, something went wrong.