diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index f56a57afa..591ed9e81 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -22,7 +22,8 @@ jobs: - name: Checkout CredData uses: actions/checkout@v3 with: - repository: Samsung/CredData + repository: babenek/CredData + ref: jwt - name: Cache data id: cache-data @@ -62,7 +63,8 @@ jobs: - name: Checkout CredData uses: actions/checkout@v3 with: - repository: Samsung/CredData + repository: babenek/CredData + ref: jwt - name: Cache data id: cache-data @@ -148,187 +150,3 @@ jobs: # update cicd/benchmark.txt with uploaded artifact if a difference is found run: | diff CredSweeper/cicd/benchmark.txt benchmark.txt - -# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # - - performance_benchmark: - # put the benchmark in single job to keep constant environment during test - needs: [download_data] - - runs-on: ubuntu-latest - strategy: - fail-fast: false - matrix: - python-version: ["3.8", "3.9", "3.10", "3.11"] - - steps: - - - name: Checkout CredData - uses: actions/checkout@v3 - with: - repository: Samsung/CredData - - - name: Cache data - id: cache-data - uses: actions/cache@v3 - with: - path: data - key: cred-data-${{ hashFiles('snapshot.yaml') }} - - - name: Failure in case when cache missed - if: steps.cache-data.outputs.cache-hit != 'true' - run: exit 1 - - - name: Exclude very huge data - if: steps.cache-data.outputs.cache-hit == 'true' - run: rm -rf data/8* data/7* data/a* data/2* data/0* data/f* data/b* data/d* - - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v3 - with: - python-version: ${{ matrix.python-version }} - - - name: Add synthetic huge data - if: steps.cache-data.outputs.cache-hit == 'true' - run: python -c "for n in range(7654321):print(f'{n:08x}')" >data/test.text - - - name: Update PIP - run: python -m pip install --upgrade pip - - - name: Fix onnxruntime lib for released version 1.5.5 - todo remove it after new release - run: python -m pip install onnxruntime==1.15.1 - - - name: Install released CredSweeper - run: | - python -m pip install credsweeper - # check the banner - credsweeper --banner - - - name: Run performance benchmark RELEASE - run: | - START_TIME=$(date +%s) - /usr/bin/time --verbose credsweeper --log error --path data --save-json /dev/null - FINISH_TIME=$(date +%s) - RELEASE_TIME=$(( ${FINISH_TIME} - ${START_TIME} )) - if [ 0 -lt ${RELEASE_TIME} ]; then - echo Elapsed $(date -ud "@${RELEASE_TIME}" +"%H:%M:%S") - else - echo "Wrong result '${RELEASE_TIME}'" - exit 1 - fi - echo "RELEASE_TIME=${RELEASE_TIME}" >> $GITHUB_ENV - - - name: Uninstall released CredSweeper - run: | - python -m pip uninstall -y credsweeper - - - name: Checkout base CredSweeper - uses: actions/checkout@v3 - with: - ref: ${{ github.event.pull_request.base.sha }} - path: temp/CredSweeper.base - - - name: Install base CredSweeper - run: | - python -m pip install temp/CredSweeper.base - # check the banner - credsweeper --banner - - - name: Run performance benchmark BASE - run: | - START_TIME=$(date +%s) - /usr/bin/time --verbose credsweeper --log error --path data --save-json /dev/null - FINISH_TIME=$(date +%s) - BASE_TIME=$(( ${FINISH_TIME} - ${START_TIME} )) - if [ 0 -lt ${BASE_TIME} ]; then - echo Elapsed $(date -ud "@${BASE_TIME}" +"%H:%M:%S") - else - echo "Wrong result '${BASE_TIME}'" - exit 1 - fi - echo "BASE_TIME=${BASE_TIME}" >> $GITHUB_ENV - - - name: Checkout current CredSweeper - uses: actions/checkout@v3 - with: - ref: ${{ github.event.pull_request.head.sha }} - path: temp/CredSweeper.head - - - name: Install current CredSweeper - run: | - python -m pip install temp/CredSweeper.head - # check the banner - credsweeper --banner - - - name: Run performance benchmark CURRENT - run: | - START_TIME=$(date +%s) - /usr/bin/time --verbose credsweeper --log error --path data --save-json /dev/null - FINISH_TIME=$(date +%s) - HEAD_TIME=$(( ${FINISH_TIME} - ${START_TIME} )) - if [ 0 -lt ${HEAD_TIME} ]; then - echo Elapsed $(date -ud "@${HEAD_TIME}" +"%H:%M:%S") - else - echo "Wrong result '${HEAD_TIME}'" - exit 1 - fi - echo "HEAD_TIME=${HEAD_TIME}" >> $GITHUB_ENV - - - name: Compare results - run: | - exit_code=0 - LOW_DELTA=10 - THRESHOLD=250 - - # RELEASE - if [ ${RELEASE_TIME} -le ${HEAD_TIME} ]; then - d=$(( 1000 * ( ${HEAD_TIME} - ${RELEASE_TIME} ) / ${RELEASE_TIME} )) - echo "RELEASE_TIME (sec) = ${RELEASE_TIME}, current (sec) = ${HEAD_TIME}. Diff (% * 10): ${d}" - if [ $LOW_DELTA -ge ${d} ]; then - echo "Almost the same." - elif [ $THRESHOLD -lt ${d} ]; then - echo "Significantly Slowdown." - exit_code=1 - else - echo "Slowdown." - fi - else - d=$(( 1000 * ( ${RELEASE_TIME} - ${HEAD_TIME} ) / ${RELEASE_TIME} )) - echo "RELEASE_TIME (sec) = ${RELEASE_TIME}, current (sec) = ${HEAD_TIME}. Diff (% * 10): ${d}" - if [ $LOW_DELTA -ge ${d} ]; then - echo "Almost the same." - elif [ $THRESHOLD -lt ${d} ]; then - echo "Significantly speed-up." - else - echo "Speed-up." - fi - fi - - # BASE - if [ ${BASE_TIME} -le ${HEAD_TIME} ]; then - d=$(( 1000 * ( ${HEAD_TIME} - ${BASE_TIME} ) / ${BASE_TIME} )) - echo "BASE_TIME (sec) = ${BASE_TIME}, current (sec) = ${HEAD_TIME}. Diff (% * 10): ${d}" - if [ $LOW_DELTA -ge ${d} ]; then - echo "Almost the same." - elif [ $THRESHOLD -lt ${d} ]; then - echo "Significantly Slowdown." - exit_code=1 - else - echo "Slowdown." - fi - else - d=$(( 1000 * ( ${BASE_TIME} - ${HEAD_TIME} ) / ${BASE_TIME} )) - echo "BASE_TIME (sec) = ${BASE_TIME}, current (sec) = ${HEAD_TIME}. Diff (% * 10): ${d}" - if [ $LOW_DELTA -ge ${d} ]; then - echo "Almost the same." - elif [ $THRESHOLD -lt ${d} ]; then - echo "Significantly speed-up." - else - echo "Speed-up." - fi - fi - - exit ${exit_code} - -# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # - diff --git a/tests/__init__.py b/tests/__init__.py index a4935e8e3..26997147f 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -1,7 +1,7 @@ from pathlib import Path # total number of files in test samples -SAMPLES_FILES_COUNT: int = 120 +SAMPLES_FILES_COUNT: int = 121 # credentials count after scan SAMPLES_CRED_COUNT: int = 373 diff --git a/tests/data/depth_3.json b/tests/data/depth_3.json index 63a1c43a1..07956c926 100644 --- a/tests/data/depth_3.json +++ b/tests/data/depth_3.json @@ -5807,23 +5807,23 @@ { "api_validation": "NOT_AVAILABLE", "ml_validation": "VALIDATED_KEY", - "ml_probability": 0.99907, + "ml_probability": 0.82258, "rule": "JSON Web Token", "severity": "medium", "line_data_list": [ { - "line": "$payload = 'eyJhbGciOiJ0eXAifQ.eyJcaaF9xCe7shE0ENPiBlEJOpS'", + "line": "$payload = \"eyJhbGciOiJ0-_-_-_-_-_-.eyJcaaF9xCe7shE0ENPiBlEJOpS\"", "line_num": 1, "path": "tests/samples/json_web_token.hs", "info": "tests/samples/json_web_token.hs|RAW", - "value": "eyJhbGciOiJ0eXAifQ.eyJcaaF9xCe7shE0ENPiBlEJOpS", + "value": "eyJhbGciOiJ0-_-_-_-_-_-.eyJcaaF9xCe7shE0ENPiBlEJOpS", "value_start": 12, - "value_end": 58, + "value_end": 63, "variable": null, "entropy_validation": { "iterator": "BASE64_CHARS", - "entropy": 4.517508706965262, - "valid": true + "entropy": 3.6658808986352547, + "valid": false } } ] diff --git a/tests/data/doc.json b/tests/data/doc.json index 065e4fe91..62473d5f1 100644 --- a/tests/data/doc.json +++ b/tests/data/doc.json @@ -10409,23 +10409,23 @@ { "api_validation": "NOT_AVAILABLE", "ml_validation": "VALIDATED_KEY", - "ml_probability": 0.99907, + "ml_probability": 0.82258, "rule": "JSON Web Token", "severity": "medium", "line_data_list": [ { - "line": "$payload = 'eyJhbGciOiJ0eXAifQ.eyJcaaF9xCe7shE0ENPiBlEJOpS'", + "line": "$payload = \"eyJhbGciOiJ0-_-_-_-_-_-.eyJcaaF9xCe7shE0ENPiBlEJOpS\"", "line_num": 1, "path": "tests/samples/json_web_token.hs", "info": "tests/samples/json_web_token.hs|RAW", - "value": "eyJhbGciOiJ0eXAifQ.eyJcaaF9xCe7shE0ENPiBlEJOpS", + "value": "eyJhbGciOiJ0-_-_-_-_-_-.eyJcaaF9xCe7shE0ENPiBlEJOpS", "value_start": 12, - "value_end": 58, + "value_end": 63, "variable": null, "entropy_validation": { "iterator": "BASE64_CHARS", - "entropy": 4.517508706965262, - "valid": true + "entropy": 3.6658808986352547, + "valid": false } } ] diff --git a/tests/data/ml_threshold_0.json b/tests/data/ml_threshold_0.json index e60208899..4101fa797 100644 --- a/tests/data/ml_threshold_0.json +++ b/tests/data/ml_threshold_0.json @@ -7660,18 +7660,18 @@ "severity": "medium", "line_data_list": [ { - "line": "$payload = 'eyJhbGciOiJ0eXAifQ.eyJcaaF9xCe7shE0ENPiBlEJOpS'", + "line": "$payload = \"eyJhbGciOiJ0-_-_-_-_-_-.eyJcaaF9xCe7shE0ENPiBlEJOpS\"", "line_num": 1, "path": "tests/samples/json_web_token.hs", "info": "", - "value": "eyJhbGciOiJ0eXAifQ.eyJcaaF9xCe7shE0ENPiBlEJOpS", + "value": "eyJhbGciOiJ0-_-_-_-_-_-.eyJcaaF9xCe7shE0ENPiBlEJOpS", "value_start": 12, - "value_end": 58, + "value_end": 63, "variable": null, "entropy_validation": { "iterator": "BASE64_CHARS", - "entropy": 4.517508706965262, - "valid": true + "entropy": 3.6658808986352547, + "valid": false } } ] diff --git a/tests/data/output.json b/tests/data/output.json index 6268c6344..ecb3ff6bb 100644 --- a/tests/data/output.json +++ b/tests/data/output.json @@ -5663,23 +5663,23 @@ { "api_validation": "NOT_AVAILABLE", "ml_validation": "VALIDATED_KEY", - "ml_probability": 0.99907, + "ml_probability": 0.82258, "rule": "JSON Web Token", "severity": "medium", "line_data_list": [ { - "line": "$payload = 'eyJhbGciOiJ0eXAifQ.eyJcaaF9xCe7shE0ENPiBlEJOpS'", + "line": "$payload = \"eyJhbGciOiJ0-_-_-_-_-_-.eyJcaaF9xCe7shE0ENPiBlEJOpS\"", "line_num": 1, "path": "tests/samples/json_web_token.hs", "info": "", - "value": "eyJhbGciOiJ0eXAifQ.eyJcaaF9xCe7shE0ENPiBlEJOpS", + "value": "eyJhbGciOiJ0-_-_-_-_-_-.eyJcaaF9xCe7shE0ENPiBlEJOpS", "value_start": 12, - "value_end": 58, + "value_end": 63, "variable": null, "entropy_validation": { "iterator": "BASE64_CHARS", - "entropy": 4.517508706965262, - "valid": true + "entropy": 3.6658808986352547, + "valid": false } } ] diff --git a/tests/samples/false_jwt.eml b/tests/samples/false_jwt.eml index 485655c98..b9283e83e 100644 --- a/tests/samples/false_jwt.eml +++ b/tests/samples/false_jwt.eml @@ -1 +1,2 @@ -eyJAlZtHKjCmuF7VOfkYIlcd6iG7bz59JA3hELeC8hrlJfZ8z5C0j7JAEnQBTfy6rAPZmRBqU7k6 \ No newline at end of file +eyJAlZtHKjCmuF7VOfkYIlcd6iG7bz59JA3hELeC8hrlJfZ8z5C0j7JAEnQBTfy6rAPZmRBqU7k6 + diff --git a/tests/samples/json_web_token.hs b/tests/samples/json_web_token.hs index 47430a585..7aca073c1 100644 --- a/tests/samples/json_web_token.hs +++ b/tests/samples/json_web_token.hs @@ -1 +1,2 @@ -$payload = 'eyJhbGciOiJ0eXAifQ.eyJcaaF9xCe7shE0ENPiBlEJOpS' \ No newline at end of file +$payload = "eyJhbGciOiJ0-_-_-_-_-_-.eyJcaaF9xCe7shE0ENPiBlEJOpS" +