Merge pull request #46 from thetjswo/main
Data: intermediate commit while building the pipeline
thetjswo authored Feb 11, 2024
2 parents b5fd1bd + 99d1266 commit 6f5f594
Showing 5 changed files with 330 additions and 1 deletion.
Binary file modified .DS_Store
Binary file modified Preprocessing/.DS_Store
Binary file added Preprocessing/seonjae/.DS_Store
291 changes: 291 additions & 0 deletions Preprocessing/seonjae/preprocessing_train_df.ipynb
@@ -0,0 +1,291 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "bc3ebba9",
"metadata": {},
"source": [
"# 서로 다른 모델과 분할된 데이터, 그 결과에 대한 병합\n",
"---\n",
"\n",
"## 개요\n",
"승순님이 만들어주신 100개의 chunk파일을 적절하게 사용하여 최소 3개의 모델을 학습시킨다. 각 모델의 성능을 병합하는 앙상블 기법을 사용하여 최종 결과를 산출한다. \n",
"\n",
"## 목차\n",
"* 데이터 압축해제 및 확인하기\n",
"* 파이프라인 구축하기\n",
"* 모델 구조 설계하기\n",
"* 각각의 모델 학습하기\n",
"* 학습 결과 병합하기"
]
},
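{
"cell_type": "markdown",
"id": "ensemble-sketch-md",
"metadata": {},
"source": [
"A minimal sketch of what the final merging step could look like, assuming each trained model is a Keras model whose `predict` returns per-class probabilities; soft voting simply averages those probabilities and takes the most likely class. The `models` list and the `images` batch are placeholder names, not part of the original pipeline."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ensemble-sketch-code",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"\n",
"def ensemble_predict(models, images):\n",
"    # Average the per-class probabilities predicted by every model (soft voting)\n",
"    probs = [model.predict(images, verbose=0) for model in models]\n",
"    mean_probs = np.mean(probs, axis=0)\n",
"    # The final label is the class with the highest averaged probability\n",
"    return np.argmax(mean_probs, axis=1)"
]
},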
{
"cell_type": "markdown",
"id": "3b08b963",
"metadata": {},
"source": [
"### STEP 1. 데이터 압축해제 및 확인하기\n",
"승순님이 만들어 놓은 청크 파일을 압축해제하고, 파일이 어떻게 구성되어 있는지 확인한다."
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "28027d21",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import zipfile\n",
"\n",
"compressed_file_path = os.getenv('HOME') + '/aiffel/Kaggle_Hackathon/data/simple_shuffle_data.zip'\n",
"gz_file_path = os.getenv('HOME') + '/aiffel/Kaggle_Hackathon/data/compressed_simple'"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "fa370735",
"metadata": {},
"outputs": [
{
"ename": "KeyboardInterrupt",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m/tmp/ipykernel_253/1561374210.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 6\u001b[0;31m \u001b[0mextract_zip\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcompressed_file_path\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mgz_file_path\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 7\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'All Done!'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/tmp/ipykernel_253/1561374210.py\u001b[0m in \u001b[0;36mextract_zip\u001b[0;34m(current_file_path, extract_file_path)\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mextract_zip\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcurrent_file_path\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mextract_file_path\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mzipfile\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mZipFile\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcurrent_file_path\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'r'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mzip_ref\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0mzip_ref\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mextractall\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mextract_file_path\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/opt/conda/lib/python3.9/zipfile.py\u001b[0m in \u001b[0;36mextractall\u001b[0;34m(self, path, members, pwd)\u001b[0m\n\u001b[1;32m 1631\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1632\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mzipinfo\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mmembers\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1633\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_extract_member\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mzipinfo\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpath\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpwd\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1634\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1635\u001b[0m \u001b[0;34m@\u001b[0m\u001b[0mclassmethod\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/opt/conda/lib/python3.9/zipfile.py\u001b[0m in \u001b[0;36m_extract_member\u001b[0;34m(self, member, targetpath, pwd)\u001b[0m\n\u001b[1;32m 1686\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmember\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpwd\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mpwd\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0msource\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;31m \u001b[0m\u001b[0;31m\\\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1687\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtargetpath\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"wb\"\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mtarget\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1688\u001b[0;31m \u001b[0mshutil\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcopyfileobj\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msource\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtarget\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1689\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1690\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mtargetpath\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/opt/conda/lib/python3.9/shutil.py\u001b[0m in \u001b[0;36mcopyfileobj\u001b[0;34m(fsrc, fdst, length)\u001b[0m\n\u001b[1;32m 203\u001b[0m \u001b[0mfdst_write\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfdst\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwrite\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 204\u001b[0m \u001b[0;32mwhile\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 205\u001b[0;31m \u001b[0mbuf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfsrc_read\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlength\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 206\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mbuf\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 207\u001b[0m \u001b[0;32mbreak\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/opt/conda/lib/python3.9/zipfile.py\u001b[0m in \u001b[0;36mread\u001b[0;34m(self, n)\u001b[0m\n\u001b[1;32m 920\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_offset\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 921\u001b[0m \u001b[0;32mwhile\u001b[0m \u001b[0mn\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m0\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_eof\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 922\u001b[0;31m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_read1\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 923\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mn\u001b[0m \u001b[0;34m<\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 924\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_readbuffer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/opt/conda/lib/python3.9/zipfile.py\u001b[0m in \u001b[0;36m_read1\u001b[0;34m(self, n)\u001b[0m\n\u001b[1;32m 988\u001b[0m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_decompressor\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0munconsumed_tail\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 989\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mn\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 990\u001b[0;31m \u001b[0mdata\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_read2\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mn\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 991\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 992\u001b[0m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_read2\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/opt/conda/lib/python3.9/zipfile.py\u001b[0m in \u001b[0;36m_read2\u001b[0;34m(self, n)\u001b[0m\n\u001b[1;32m 1020\u001b[0m \u001b[0mn\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mn\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_compress_left\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1021\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1022\u001b[0;31m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_fileobj\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1023\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_compress_left\u001b[0m \u001b[0;34m-=\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1024\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/opt/conda/lib/python3.9/zipfile.py\u001b[0m in \u001b[0;36mread\u001b[0;34m(self, n)\u001b[0m\n\u001b[1;32m 740\u001b[0m \"Close the writing handle before trying to read.\")\n\u001b[1;32m 741\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_file\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mseek\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_pos\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 742\u001b[0;31m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_file\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 743\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_pos\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_file\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtell\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 744\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mKeyboardInterrupt\u001b[0m: "
]
}
],
"source": [
"def extract_zip(current_file_path, extract_file_path):\n",
" with zipfile.ZipFile(current_file_path, 'r') as zip_ref:\n",
" zip_ref.extractall(extract_file_path)\n",
" \n",
" \n",
"extract_zip(compressed_file_path, gz_file_path)\n",
"\n",
"print('All Done!')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ed69ff0a",
"metadata": {},
"outputs": [],
"source": [
"import gzip\n",
"import shutil\n",
"\n",
"def extract_gz(extract_file_path):\n",
" for i in range(100):\n",
" compressed_file_path = os.path.join(gz_file_path,f'train_k{i}.gz')\n",
"# file_name = os.path.basename(compressed_file_path)\n",
" extract_path = os.path.join(extract_file_path, os.path.splitext(file_name)[0])\n",
"\n",
" with gzip.open(compressed_file_path, 'rb') as f_in:\n",
" with open(extract_path, 'wb') as f_out:\n",
" shutil.copyfileobj(f_in, f_out)\n",
" \n",
"extract_to_path = os.getenv('HOME') + '/aiffel/Kaggle_Hackathon/data/simple'\n",
"\n",
"data = extract_gz(extract_to_path)\n",
"\n",
"print('All Done!')"
]
},
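{
"cell_type": "markdown",
"id": "extract-check-md",
"metadata": {},
"source": [
"A quick sanity check, assuming the chunks were decompressed into the `simple` directory used above: list the extracted files and peek at the first few names."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "extract-check-code",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"extracted_dir = os.getenv('HOME') + '/aiffel/Kaggle_Hackathon/data/simple'\n",
"extracted_files = sorted(os.listdir(extracted_dir))\n",
"\n",
"print(f'{len(extracted_files)} files extracted')\n",
"print(extracted_files[:5])  # peek at the first few chunk names"
]
},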
{
"cell_type": "code",
"execution_count": null,
"id": "a24edcc2",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import os\n",
"\n",
"file_path = os.getenv('HOME') + '/aiffel/Kaggle_Hackathon/data/simple/train_k0'\n",
"data_df = pd.read_csv(file_path)\n",
"print(data_df.head())"
]
},
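{
"cell_type": "markdown",
"id": "drawing-preview-md",
"metadata": {},
"source": [
"Before building the full pipeline, a small sketch that renders the first `drawing` entry of the chunk loaded above, assuming the column holds stringified stroke lists of the form `[[x0, x1, ...], [y0, y1, ...]]` (the same assumption STEP 2 makes) and that `data_df` comes from the previous cell."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "drawing-preview-code",
"metadata": {},
"outputs": [],
"source": [
"import ast\n",
"from PIL import Image, ImageDraw\n",
"import matplotlib.pyplot as plt\n",
"\n",
"# Parse the stringified stroke list and draw each stroke as connected line segments\n",
"strokes = ast.literal_eval(data_df['drawing'][0])\n",
"img = Image.new('L', (256, 256), color='white')\n",
"draw = ImageDraw.Draw(img)\n",
"\n",
"for stroke in strokes:\n",
"    for i in range(len(stroke[0]) - 1):\n",
"        draw.line([stroke[0][i], stroke[1][i], stroke[0][i+1], stroke[1][i+1]], fill='black', width=3)\n",
"\n",
"plt.imshow(img, cmap='gray')\n",
"plt.axis('off')\n",
"plt.show()"
]
},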
{
"cell_type": "markdown",
"id": "b28424d0",
"metadata": {},
"source": [
"# STEP 2. 파이프라인 구축하기\n",
"압축해제된 데이터들 중 학습에 사용할 데이터를 핸들링하여 모델에 바로 입력할 수 있도록 준비한다."
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "94749ccb",
"metadata": {},
"outputs": [
{
"ename": "SyntaxError",
"evalue": "invalid syntax (2595014484.py, line 50)",
"output_type": "error",
"traceback": [
"\u001b[0;36m File \u001b[0;32m\"/tmp/ipykernel_253/2595014484.py\"\u001b[0;36m, line \u001b[0;32m50\u001b[0m\n\u001b[0;31m root_path = os.getenv('HOME') + '/aiffel/Kaggle_Hackathon/data'\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mSyntaxError\u001b[0m\u001b[0;31m:\u001b[0m invalid syntax\n"
]
}
],
"source": [
"import os\n",
"import pandas as pd\n",
"import ast\n",
"from PIL import Image, ImageDraw\n",
"import matplotlib.pyplot as plt\n",
"import tensorflow as tf\n",
"from tensorflow.keras.preprocessing.image import img_to_array\n",
"from tensorflow.keras.applications.inception_v3 import preprocess_input\n",
"import pickle\n",
"\n",
"def data_pipeline(dir_path, pkl_path, file_idx):\n",
" data_df = pd.DataFrame()\n",
" \n",
" data = pd.read_csv(os.path.join(dir_path, f'train_k{file_idx}'))\n",
"# print(data.head())\n",
" \n",
"# drawing = []\n",
"# for i in data['drawing']:\n",
"# drawing.append(ast.literal_eval(i))\n",
" \n",
" for draw_idx in data['drawing']:\n",
" img = Image.new('L', (256, 256), color='white')\n",
" draw = ImageDraw.Draw(img)\n",
"\n",
" for stroke in ast.literal_eval(draw_idx):\n",
" for i in range(len(stroke[0])-1):\n",
" draw.line([stroke[0][i], stroke[1][i], stroke[0][i+1], stroke[1][i+1]], fill='black', width=3)\n",
"\n",
" img = img.resize((28, 28))\n",
"\n",
" image_array = img_to_array(img)\n",
" image_array = preprocess_input(image_array)\n",
" \n",
" data_df = data_df.append({'img': tf.convert_to_tensor(image_array, dtype=tf.float32)}, ignore_index=True)\n",
" \n",
" \n",
" print(data_df['img'].head())\n",
" \n",
" \n",
" with open(pkl_path, 'rb') as f:\n",
" label_names = pickle.load(f)\n",
" \n",
" for i in data['y']:\n",
" data_df = data_df.append({'label': label_names[i]}, ignore_index=True)\n",
"# labels.append(label_names[i])\n",
" \n",
" print(data['y'].head()\n",
"\n",
"\n",
"root_path = os.getenv('HOME') + '/aiffel/Kaggle_Hackathon/data'\n",
"dir_path = os.path.join(root_path, 'simple')\n",
"pkl_path = os.path.join(root_path, 'label_names.pkl')\n",
"test = data_pipeline(dir_path, pkl_path, 50)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "eebf25fa",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import pandas as pd\n",
"import ast\n",
"from PIL import Image, ImageDraw\n",
"import matplotlib.pyplot as plt\n",
"import tensorflow as tf\n",
"from tensorflow.keras.preprocessing.image import img_to_array\n",
"from tensorflow.keras.applications.inception_v3 import preprocess_input\n",
"import pickle\n",
"\n",
"def data_pipeline(dir_path, pkl_path, file_idx):\n",
" # 데이터와 레이블을 담을 리스트 생성\n",
" data_list = []\n",
" label_list = []\n",
"\n",
" # 데이터 읽기\n",
" data = pd.read_csv(os.path.join(dir_path, f'train_k{file_idx}'))\n",
" drawing = data['drawing'].apply(ast.literal_eval).tolist() # drawing 열을 리스트로 변환\n",
" \n",
" with open(pkl_path, 'rb') as f:\n",
" label_names = pickle.load(f)\n",
"\n",
" for draw_idx, label_idx in zip(drawing, data['y']):\n",
" img = Image.new('L', (256, 256), color='white')\n",
" draw = ImageDraw.Draw(img)\n",
"\n",
" for stroke in draw_idx:\n",
" for i in range(len(stroke[0]) - 1):\n",
" draw.line([stroke[0][i], stroke[1][i], stroke[0][i+1], stroke[1][i+1]], fill='black', width=3)\n",
"\n",
" img = img.resize((28, 28))\n",
"\n",
" image_array = img_to_array(img)\n",
" image_array = preprocess_input(image_array)\n",
"\n",
" data_list.append(tf.convert_to_tensor(image_array, dtype=tf.float32))\n",
" label_list.append(label_names[label_idx])\n",
"\n",
" # DataFrame 생성\n",
" data_df = pd.DataFrame({'img': data_list, 'label': label_list})\n",
"\n",
" return data_df\n",
"\n",
"\n",
"root_path = os.getenv('HOME') + '/aiffel/Kaggle_Hackathon/data'\n",
"dir_path = os.path.join(root_path, 'simple')\n",
"pkl_path = os.path.join(root_path, 'label_names.pkl')\n",
"test = data_pipeline(dir_path, pkl_path, 50)"
]
},
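{
"cell_type": "markdown",
"id": "dataset-sketch-md",
"metadata": {},
"source": [
"A possible follow-up step toward feeding the data directly into a model, sketched under the assumption that `test` is the DataFrame returned above and that its `label` column holds class-name strings; the batch size and the label-to-index mapping are illustrative placeholders."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "dataset-sketch-code",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import tensorflow as tf\n",
"\n",
"# Stack the image tensors and map the class-name labels to integer indices\n",
"images = tf.stack(test['img'].tolist())\n",
"class_names = sorted(test['label'].unique())\n",
"label_to_idx = {name: i for i, name in enumerate(class_names)}\n",
"labels = np.array([label_to_idx[name] for name in test['label']])\n",
"\n",
"# Wrap everything in a shuffled, batched tf.data.Dataset ready for model.fit()\n",
"dataset = (tf.data.Dataset.from_tensor_slices((images, labels))\n",
"           .shuffle(1000)\n",
"           .batch(32)\n",
"           .prefetch(tf.data.AUTOTUNE))"
]
},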
{
"cell_type": "code",
"execution_count": null,
"id": "d604126a",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}