# Parse docx in --doc mode #1479
# Workflow file for this run
# This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# This workflow runs the benchmark.
# The jobs are separated so that the downloaded data stays cached even if the
# benchmark itself fails.
name: Benchmark

# Triggered by pushes to main and by pull requests that target main.
on:
  push:
    branches: [main]
  pull_request:
    branches: [main]

jobs:
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
  # Builds the CredData data set in `data` and stores it in the Actions cache.
  # The other jobs restore the same cache key instead of downloading again.
  download_data:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout CredData
        uses: actions/checkout@v3
        with:
          repository: Samsung/CredData
      # The key is derived from snapshot.yaml, so a changed snapshot produces a
      # cache miss and the data is generated again.
      - name: Cache data
        id: cache-data
        uses: actions/cache@v3
        with:
          path: data
          key: cred-data-${{ hashFiles('snapshot.yaml') }}
      # All following steps are needed only to generate the data on a cache miss.
      - name: Set up Python 3.8
        if: steps.cache-data.outputs.cache-hit != 'true'
        uses: actions/setup-python@v3
        with:
          python-version: "3.8"
      # Guarded like its neighbours: on a cache hit no Python is set up and
      # nothing is installed, so upgrading pip would only touch whatever
      # interpreter the runner image happens to provide.
      - name: Update PIP
        if: steps.cache-data.outputs.cache-hit != 'true'
        run: python -m pip install --upgrade pip
      - name: Install requirements of CredData
        if: steps.cache-data.outputs.cache-hit != 'true'
        run: python -m pip install --requirement requirements.txt
      - name: Generate Data Asset
        if: steps.cache-data.outputs.cache-hit != 'true'
        run: python download_data.py --data_dir data
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
  # Runs the CredData benchmark with the credsweeper scanner and compares the
  # extracted scores with the reference file cicd/benchmark.txt of CredSweeper.
  run_benchmark:
    needs: [download_data]
    runs-on: ubuntu-latest
    steps:
      - name: Checkout CredData
        uses: actions/checkout@v3
        with:
          repository: Samsung/CredData
      - name: Cache data
        id: cache-data
        uses: actions/cache@v3
        with:
          path: data
          key: cred-data-${{ hashFiles('snapshot.yaml') }}
      # download_data is expected to have filled the cache; stop early otherwise.
      - name: Failure in case when cache missed
        if: steps.cache-data.outputs.cache-hit != 'true'
        run: exit 1
      - name: Check Data Asset - DEBUG
        if: steps.cache-data.outputs.cache-hit == 'true'
        run: ls -al . && ls -al data
      - name: Set up Python 3.8
        uses: actions/setup-python@v3
        with:
          python-version: "3.8"
      - name: Update PIP
        run: python -m pip install --upgrade pip
      - name: Install requirements of CredData
        run: python -m pip install --requirement requirements.txt
      - name: Fix onnxruntime lib for released version 1.5.5 - todo remove it after new release
        run: python -m pip install onnxruntime==1.15.1
      # Pull requests only: put the PR head into temp/CredSweeper.
      # NOTE(review): presumably this is the directory the benchmark would
      # otherwise clone CredSweeper into - confirm against the CredData benchmark.
      - name: Checkout CredSweeper
        if: ${{ 'pull_request' == github.event_name }}
        uses: actions/checkout@v3
        with:
          ref: ${{ github.event.pull_request.head.sha }}
          path: temp/CredSweeper
      # Replaces the repository URL constant with a dummy scheme; the grep makes
      # the step fail when the substitution did not take place.
      - name: Patch benchmark for PR work
        if: ${{ 'pull_request' == github.event_name }}
        run: |
          sed -i 's|CREDSWEEPER = "https://github.com/Samsung/CredSweeper.git"|CREDSWEEPER = "dummy://github.com/Samsung/CredSweeper.git"|' benchmark/common/constants.py
          grep --with-filename --line-number 'dummy://github.com/Samsung/CredSweeper.git' benchmark/common/constants.py
      - name: Run Benchmark
        run: |
          # The default shell runs without pipefail, so the pipeline would
          # report only the exit status of tee and hide a failed benchmark.
          set -o pipefail
          python -m benchmark --scanner credsweeper | tee credsweeper.log
      # Keeps the first 12 and the last 14 log lines (minus the elapsed time
      # line) as the score summary and copies the scanner report next to it.
      - name: Get only results
        run: |
          head -n 12 credsweeper.log | tee benchmark.txt
          tail -n 14 credsweeper.log | grep -v 'Time Elapsed:' | tee -a benchmark.txt
          cp -vf ./temp/CredSweeper/output.json report.json
      # v3 of the artifact actions has been retired by GitHub; v4 accepts the
      # same `name` and `path` inputs. The step names are made unique so the
      # two uploads can be told apart in the run log.
      - name: Upload artifact report
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: report
          path: report.json
      - name: Upload artifact benchmark
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: benchmark
          path: benchmark.txt
      - name: Verify benchmark scores of the PR
        if: ${{ 'pull_request' == github.event_name }}
        # update cicd/benchmark.txt with uploaded artifact if a difference is found
        run: |
          diff temp/CredSweeper/cicd/benchmark.txt benchmark.txt
      # No `ref` here: there is no pull_request payload outside pull_request
      # events, so the default ref of the triggering event is checked out.
      - name: Checkout CredSweeper on push event
        if: ${{ 'pull_request' != github.event_name }}
        uses: actions/checkout@v3
        with:
          path: CredSweeper
      - name: Verify benchmark scores on push event
        if: ${{ 'pull_request' != github.event_name }}
        # update cicd/benchmark.txt with uploaded artifact if a difference is found
        run: |
          diff CredSweeper/cicd/benchmark.txt benchmark.txt
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
  # Measures the wall-clock scan time of three CredSweeper builds (latest
  # release from pip, PR base, PR head) on the same data and fails when the
  # head is more than 25% slower than either of the other two.
  performance_benchmark:
    # put the benchmark in single job to keep constant environment during test
    needs: [download_data]
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
        python-version: ["3.8", "3.9", "3.10", "3.11"]
    steps:
      - name: Checkout CredData
        uses: actions/checkout@v3
        with:
          repository: Samsung/CredData
      - name: Cache data
        id: cache-data
        uses: actions/cache@v3
        with:
          path: data
          key: cred-data-${{ hashFiles('snapshot.yaml') }}
      # download_data is expected to have filled the cache; stop early otherwise.
      - name: Failure in case when cache missed
        if: steps.cache-data.outputs.cache-hit != 'true'
        run: exit 1
      - name: Exclude very huge data
        if: steps.cache-data.outputs.cache-hit == 'true'
        run: rm -rf data/8* data/7* data/a* data/2* data/0* data/f* data/b* data/d*
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v3
        with:
          python-version: ${{ matrix.python-version }}
      # Writes 7654321 lines of 8-digit hexadecimal counters into data/test.text.
      - name: Add synthetic huge data
        if: steps.cache-data.outputs.cache-hit == 'true'
        run: python -c "for n in range(7654321):print(f'{n:08x}')" >data/test.text
      - name: Update PIP
        run: python -m pip install --upgrade pip
      - name: Fix onnxruntime lib for released version 1.5.5 - todo remove it after new release
        run: python -m pip install onnxruntime==1.15.1
      - name: Install released CredSweeper
        run: |
          python -m pip install credsweeper
          # check the banner
          credsweeper --banner
      # Each measurement stores its elapsed seconds in GITHUB_ENV so that the
      # final comparison step can read it; a non-positive duration is an error.
      - name: Run performance benchmark RELEASE
        run: |
          START_TIME=$(date +%s)
          /usr/bin/time --verbose credsweeper --log error --path data --save-json /dev/null
          FINISH_TIME=$(date +%s)
          RELEASE_TIME=$(( ${FINISH_TIME} - ${START_TIME} ))
          if [ 0 -lt ${RELEASE_TIME} ]; then
            echo Elapsed $(date -ud "@${RELEASE_TIME}" +"%H:%M:%S")
          else
            echo "Wrong result '${RELEASE_TIME}'"
            exit 1
          fi
          echo "RELEASE_TIME=${RELEASE_TIME}" >> $GITHUB_ENV
      - name: Uninstall released CredSweeper
        run: |
          python -m pip uninstall -y credsweeper
      # NOTE(review): github.event.pull_request is only present for
      # pull_request events. On push the base and head refs below are empty,
      # so both checkouts presumably resolve to the pushed commit - confirm
      # that comparing a commit with itself is the intended push behaviour.
      - name: Checkout base CredSweeper
        uses: actions/checkout@v3
        with:
          ref: ${{ github.event.pull_request.base.sha }}
          path: temp/CredSweeper.base
      - name: Install base CredSweeper
        run: |
          python -m pip install temp/CredSweeper.base
          # check the banner
          credsweeper --banner
      - name: Run performance benchmark BASE
        run: |
          START_TIME=$(date +%s)
          /usr/bin/time --verbose credsweeper --log error --path data --save-json /dev/null
          FINISH_TIME=$(date +%s)
          BASE_TIME=$(( ${FINISH_TIME} - ${START_TIME} ))
          if [ 0 -lt ${BASE_TIME} ]; then
            echo Elapsed $(date -ud "@${BASE_TIME}" +"%H:%M:%S")
          else
            echo "Wrong result '${BASE_TIME}'"
            exit 1
          fi
          echo "BASE_TIME=${BASE_TIME}" >> $GITHUB_ENV
      - name: Checkout current CredSweeper
        uses: actions/checkout@v3
        with:
          ref: ${{ github.event.pull_request.head.sha }}
          path: temp/CredSweeper.head
      - name: Install current CredSweeper
        run: |
          python -m pip install temp/CredSweeper.head
          # check the banner
          credsweeper --banner
      - name: Run performance benchmark CURRENT
        run: |
          START_TIME=$(date +%s)
          /usr/bin/time --verbose credsweeper --log error --path data --save-json /dev/null
          FINISH_TIME=$(date +%s)
          HEAD_TIME=$(( ${FINISH_TIME} - ${START_TIME} ))
          if [ 0 -lt ${HEAD_TIME} ]; then
            echo Elapsed $(date -ud "@${HEAD_TIME}" +"%H:%M:%S")
          else
            echo "Wrong result '${HEAD_TIME}'"
            exit 1
          fi
          echo "HEAD_TIME=${HEAD_TIME}" >> $GITHUB_ENV
      - name: Compare results
        run: |
          exit_code=0
          # Both limits are in tenths of a percent of the reference time:
          # up to 1.0% counts as noise, above 25.0% as significant.
          LOW_DELTA=10
          THRESHOLD=250

          # compare_with LABEL REFERENCE_TIME
          # Reports how HEAD_TIME relates to REFERENCE_TIME and sets
          # exit_code=1 when the slowdown exceeds THRESHOLD.
          # A speed-up is only reported and never fails the job.
          compare_with() {
            local label=$1
            local reference=$2
            local d
            if [ ${reference} -le ${HEAD_TIME} ]; then
              d=$(( 1000 * ( ${HEAD_TIME} - ${reference} ) / ${reference} ))
              echo "${label} (sec) = ${reference}, current (sec) = ${HEAD_TIME}. Diff (% * 10): ${d}"
              if [ $LOW_DELTA -ge ${d} ]; then
                echo "Almost the same."
              elif [ $THRESHOLD -lt ${d} ]; then
                echo "Significantly Slowdown."
                exit_code=1
              else
                echo "Slowdown."
              fi
            else
              d=$(( 1000 * ( ${reference} - ${HEAD_TIME} ) / ${reference} ))
              echo "${label} (sec) = ${reference}, current (sec) = ${HEAD_TIME}. Diff (% * 10): ${d}"
              if [ $LOW_DELTA -ge ${d} ]; then
                echo "Almost the same."
              elif [ $THRESHOLD -lt ${d} ]; then
                echo "Significantly speed-up."
              else
                echo "Speed-up."
              fi
            fi
          }

          # RELEASE
          compare_with RELEASE_TIME "${RELEASE_TIME}"
          # BASE
          compare_with BASE_TIME "${BASE_TIME}"
          exit ${exit_code}
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #