diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..4af93ff --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,64 @@ +name: CI + +on: + push: + branches: [master, main, "claude/**"] + pull_request: + +jobs: + test: + name: perl ${{ matrix.perl }} on ${{ matrix.os }} + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest] + perl: ["5.36", "5.38", "5.40"] + include: + - os: macos-latest + perl: "5.38" + + steps: + - uses: actions/checkout@v4 + + - name: Set up perl + uses: shogo82148/actions-setup-perl@v1 + with: + perl-version: ${{ matrix.perl }} + install-modules-with: cpanm + install-modules: | + ExtUtils::MakeMaker + JSON::PP + MIME::Base64 + Encode + Archive::Tar + IO::Compress::Gzip + IO::Uncompress::Gunzip + Test::More + + - name: Install optional deps (best effort) + run: | + cpanm --notest --quiet IO::Compress::Brotli || true + cpanm --notest --quiet Archive::Zip || true + cpanm --notest --quiet DBD::SQLite || true + cpanm --notest --quiet URI || true + + - name: Install brotli & zip CLIs + if: runner.os == 'Linux' + run: sudo apt-get update && sudo apt-get install -y brotli zip + + - name: perl -V + run: perl -V + + - name: Build + run: perl Makefile.PL && make + + - name: Test + run: prove -lr -It/lib t/ + + - name: Smoke run + run: | + ./bin/harx --version + ./bin/harx --help + ./bin/harx extract --help + ./bin/harx audit secrets --help || true diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..2d76685 --- /dev/null +++ b/.gitignore @@ -0,0 +1,14 @@ +/blib/ +/Makefile +/Makefile.old +/MYMETA.* +/pm_to_blib +/HAR-Extractor-* +/cover_db/ +/.build/ +*.bak +*.swp +*~ +.DS_Store +/tmp/ +/out/ diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..7bb9423 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,57 @@ +# Changelog + +All notable changes to this project will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +## [0.2.0] — 2025-05-05 + +### Added + +- Brand-new `harx` CLI with subcommands: `extract`, `validate`, `sanitize`, + `audit`, `convert`, `report`, `diff`, `merge`, `dedupe`, `slice`, `graphql`, + `websocket`, `endpoints`, `stats`, `completions`. +- Modular library `HAR::Extractor::*` (Parser, Decoder, Filter, Naming, + Writer, Manifest, Logger, URL, Magic). +- Filtering: URL regex/glob, domain glob, method, status spec (`2xx`, + `4xx,5xx`, ranges), MIME, MIME regex, size with K/M/G suffix, time window, + has-header, header-regex, has-cookie, body content regex, initiator, + scheme, cache state, HTTP protocol, plus a free `--where` boolean DSL. +- Decompression: gzip, deflate, brotli (perl module *or* CLI fallback), zstd. + Charset-aware text decoding. +- Audits: `secrets` (extensible JSON catalog), `headers` (per-host scoring), + `cookies` (Secure / HttpOnly / SameSite / Domain / size), `mixed-content`, + `cors`, `pii` (Luhn-validated cards, IBAN, locale-aware DNI/NIE for `es`). +- `sanitize` produces a redacted HAR (auth headers, cookies, credential + query params, optional body redaction). +- Conversion: curl (per-entry or combined script), wget, Postman v2.1, + Insomnia v4, Netscape cookies.txt jar. +- Reports: terminal `stats`, ASCII `waterfall`, `perf` anti-pattern audit, + fully self-contained interactive `html` report. +- `diff`, `merge`, `dedupe`, `slice`, `validate` HAR manipulation. 
+- Specialized extractors: `graphql` (operation-aware splitting), `websocket` + (frames → NDJSON), `endpoints` (with `:id`/`:uuid`/`:hex` normalization + and OpenAPI-lite output). +- Output formats: filesystem, zip (with Archive::Zip or system `zip`), + tar.gz (Archive::Tar), SQLite (with DBD::SQLite). +- Manifests: JSON, CSV, JSONL, SQLite. +- Comprehensive test suite (300+ assertions covering modules, integration, + CLI subprocess flows, edge cases). +- Documentation: README, USAGE reference, COOKBOOK with 20 recipes, man + page, GitHub Actions CI matrix. + +### Changed + +- Replaced legacy `har-extract.pl` (single-file script with interactive menu) + with the modular CLI. **No backwards compatibility shim**: previous users + must migrate to `harx extract`. + +### Removed + +- `har-extract.pl` (replaced). + +## [0.1.0] — 2019-11-24 + +- Initial release: a single 86-line Perl script that extracted multimedia + resources (images and videos) from HAR files via an interactive menu. diff --git a/Makefile.PL b/Makefile.PL new file mode 100644 index 0000000..9fe455a --- /dev/null +++ b/Makefile.PL @@ -0,0 +1,46 @@ +use strict; +use warnings; +use ExtUtils::MakeMaker; + +WriteMakefile( + NAME => 'HAR::Extractor', + AUTHOR => 'Saul Blanco Tejero <@elGolpista>', + VERSION_FROM => 'lib/HAR/Extractor.pm', + ABSTRACT => 'Extract, filter, audit and convert HTTP Archive (HAR) files', + LICENSE => 'gpl_3', + MIN_PERL_VERSION => '5.036', + EXE_FILES => ['bin/harx'], + PREREQ_PM => { + 'JSON::PP' => 0, + 'MIME::Base64' => 0, + 'Encode' => 0, + 'File::Path' => 0, + 'File::Spec' => 0, + 'File::Basename' => 0, + 'File::Temp' => 0, + 'Getopt::Long' => 0, + 'Digest::SHA' => 0, + 'Digest::MD5' => 0, + 'Time::HiRes' => 0, + 'Archive::Tar' => 0, + 'IO::Uncompress::Gunzip' => 0, + 'IO::Uncompress::Inflate' => 0, + }, + TEST_REQUIRES => { + 'Test::More' => '0.98', + }, + META_MERGE => { + 'meta-spec' => { version => 2 }, + resources => { + repository => { + type => 'git', + url => 'https://github.com/Saul-BT/har-extractor.git', + web => 'https://github.com/Saul-BT/har-extractor', + }, + bugtracker => { + web => 'https://github.com/Saul-BT/har-extractor/issues', + }, + }, + }, + clean => { FILES => 'HAR-Extractor-*' }, +); diff --git a/README.md b/README.md new file mode 100644 index 0000000..3f4e31c --- /dev/null +++ b/README.md @@ -0,0 +1,166 @@ +# har-extractor (`harx`) + +> Extract, filter, audit, sanitize and convert HTTP Archive (HAR) files. + +`harx` is a command-line toolkit for working with HAR files. It started as a +small Perl script that pulled images and videos out of HAR captures, and has +grown into a multi-purpose tool for web developers, QA engineers, security +testers, and performance analysts. + +## Features + +- **Extraction** — bodies, headers, cookies, request payloads, into a + filesystem tree, zip / tar.gz / SQLite, with a JSON / CSV / JSONL manifest. + Mirror mode (URL-paths-as-directories) and full-trace mode (one folder per + entry). +- **Powerful filters** — URL regex / glob, domain glob, method, status code + ranges (`2xx`, `404,500-599`), MIME, size with K/M/G suffix, time window, + header presence/regex, cookie presence, body regex, initiator, scheme, + cache state, HTTP version. Plus a tiny boolean DSL with `AND`/`OR`/`NOT`. +- **Decompression** — gzip, deflate, brotli (via `IO::Compress::Brotli` + *or* the `brotli` CLI as fallback), zstd via the `zstd` CLI. Charset-aware + text decoding. 
+- **Security audits** — `secrets` (JWT, AWS, GitHub PAT, Stripe, OpenAI, + PEM, ...), `headers` (CSP, HSTS, X-Frame-Options, X-Content-Type-Options, + Referrer-Policy, Permissions-Policy with per-host scoring), `cookies` + (Secure / HttpOnly / SameSite), `mixed-content`, `cors`, `pii` + (Luhn-validated cards, IBAN, locale-aware DNI/NIE for Spain). +- **Sanitize** — produce a redacted HAR safe to share: masks Authorization, + cookies, sensitive query params, and (with `--redact-bodies`) any matched + secret pattern in bodies. +- **Conversion** — to curl (per-entry or one combined bash script), wget, + Postman Collection v2.1, Insomnia v4 export, Netscape `cookies.txt` jar. +- **Reports** — terminal stats with top-N hosts/MIME/slow/large/duplicates and + redirect chains; ASCII waterfall; performance anti-patterns + (uncompressed compressible, no-cache static, render-blocking head scripts, + slow/large responses, redirect chains); fully self-contained interactive + HTML report with previews, syntax-highlighted bodies, and client-side + filters. +- **HAR manipulation** — `diff` two HARs (only-in-A / only-in-B / changed), + `merge` chronologically, `dedupe`, `slice` to a time window, `validate`. +- **Specialized extractors** — `graphql` (split by operation, write + `.graphql` + variables JSON), `websocket` (frames to NDJSON timeline), + `endpoints` (unique API inventory with `:id`/`:uuid` normalization, with + optional OpenAPI-lite output). + +## Quickstart + +```bash +# Inspect what you've got +harx validate capture.har +harx stats capture.har + +# Extract everything from one host as a JSON-manifested directory tree +harx extract -i capture.har -o ./out --bodies --headers \ + --domain '*.example.com' --status 2xx + +# Generate a self-contained HTML report +harx report html -i capture.har -o report.html + +# Find leaked secrets before sharing +harx audit secrets capture.har + +# Sanitize a HAR safe to attach to a bug report +harx sanitize -i capture.har -o redacted.har --redact-bodies + +# Replay an entire login flow +harx convert curl --combined -i capture.har -o replay.sh +bash replay.sh + +# Reverse-engineer a GraphQL API +harx graphql -i capture.har --format files -o ./gql + +# Diff two captures (e.g. before / after a fix) +harx diff before.har after.har +``` + +## Installation + +`harx` requires Perl 5.36+ and only **core** modules to run with full +functionality. 
+Optional features auto-detect their dependencies:
+
+| Feature                   | Optional dependency                      |
+| ------------------------- | ---------------------------------------- |
+| Brotli `Content-Encoding` | `IO::Compress::Brotli`, or `brotli` CLI   |
+| `--output-format zip`     | `Archive::Zip`, or system `zip`           |
+| `--output-format sqlite`  | `DBD::SQLite`                             |
+| zstd `Content-Encoding`   | system `zstd`                             |
+
+```bash
+git clone https://github.com/Saul-BT/har-extractor.git
+cd har-extractor
+perl Makefile.PL && make && make test
+sudo make install   # installs `harx` and the HAR::Extractor modules
+```
+
+Or run from the source tree:
+
+```bash
+./bin/harx --help
+```
+
+## Subcommands
+
+```
+harx extract      Extract resources, optionally filtered, with manifests
+harx validate     Validate a HAR against the 1.2 schema
+harx sanitize     Produce a redacted HAR safe to share
+harx audit        secrets | headers | cookies | mixed-content | cors | pii
+harx convert      curl | wget | postman | insomnia | cookies
+harx report       stats | waterfall | perf | html
+harx diff         Compare two HAR files
+harx merge        Merge multiple HAR files chronologically
+harx dedupe       Drop duplicate entries
+harx slice        Trim to a time window
+harx graphql      Extract GraphQL operations to .graphql files
+harx websocket    Extract WebSocket frames to NDJSON
+harx endpoints    Inventory API endpoints (with :param normalization)
+harx stats        Aggregate statistics (alias for `report stats`)
+harx completions  Print bash / zsh / fish completion script
+```
+
+Run `harx <subcommand> --help` for per-command flags. See [docs/USAGE.md](docs/USAGE.md)
+and [docs/COOKBOOK.md](docs/COOKBOOK.md) for the full reference and recipes.
+
+## Filters cheat-sheet
+
+```
+--url-regex 'PATTERN'      --domain '*.example.com'   --exclude-domain
+--method GET,POST          --status 2xx | --status 200,301-302 | 4xx,5xx
+--mime application/json    --mime-regex 'json|xml'
+--min-size 100K            --max-size 5M
+--since '+30s' | ISO8601   --until ...
+--has-header 'Set-Cookie'  --header-regex 'X-Foo: bar.*'
+--has-cookie sessionid     --content-regex 'EMAIL'
+--initiator script         --secure / --no-secure
+--from-cache               --protocol http/2
+--where 'status=200 AND mime ~ json AND NOT domain ~ analytics'
+```
+
+## Exit codes
+
+```
+0  success
+1  generic error
+2  invalid usage / invalid HAR
+3  no entries matched (extract subcommand)
+4  I/O error
+```
+
+## Testing
+
+```bash
+prove -lr -It/lib t/
+```
+
+The test suite covers all modules and CLI paths (300+ assertions). Tests use
+only core Perl modules; tests requiring optional dependencies skip cleanly
+when those are absent.
+
+## License
+
+GPL-3.0-only — see [LICENSE](LICENSE).
+
+Originally written by Saúl Blanco Tejero ([@elGolpista](https://github.com/elGolpista))
+in 2019. Refactored, modularized and extended into a full toolkit while
+preserving the original attribution.
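For scripting around `harx`, the exit-code table above is the contract to build on. A minimal sketch (it assumes `harx` is already installed on `PATH`; the wrapper itself is illustrative, not part of this repository):

```perl
#!/usr/bin/env perl
# Run an extract and react to harx's documented exit codes.
use strict;
use warnings;

system('harx', 'extract', '-i', 'capture.har', '-o', './out', '--status', '2xx');
my $rc = $? >> 8;    # system() returns the wait status; the exit code is in the high byte

if    ($rc == 0) { print "extracted OK\n"; }
elsif ($rc == 3) { print "no entries matched the filters\n"; }   # documented: not a hard failure
elsif ($rc == 2) { die "invalid usage or invalid HAR\n"; }
else             { die "harx failed with exit code $rc\n"; }
```

Treating exit code 3 separately keeps "empty result" distinct from real errors in CI pipelines.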
diff --git a/bin/harx b/bin/harx
new file mode 100755
index 0000000..a80de6f
--- /dev/null
+++ b/bin/harx
@@ -0,0 +1,52 @@
+#!/usr/bin/env perl
+use v5.36;
+use strict;
+use warnings;
+use FindBin qw($RealBin);
+use lib "$RealBin/../lib";
+use HAR::Extractor::CLI;
+
+exit HAR::Extractor::CLI->run(@ARGV);
+
+__END__
+
+=head1 NAME
+
+harx - extract, filter, audit, sanitize and convert HTTP Archive (HAR) files
+
+=head1 SYNOPSIS
+
+  harx [global-opts] <subcommand> [options] [args]
+
+  harx extract --input capture.har --output ./out --bodies --status 2xx
+  harx audit secrets capture.har
+  harx sanitize --input in.har --output redacted.har
+  harx convert curl capture.har --output curls/
+  harx report html --input capture.har --output report.html
+  harx diff a.har b.har
+  harx graphql capture.har --output ./gql
+
+See C<< harx <subcommand> --help >> for per-subcommand documentation, or the
+man page L<harx(1)>.
+
+=head1 DESCRIPTION
+
+C<harx> is a command-line toolkit for working with HAR (HTTP Archive) files.
+
+=head1 EXIT CODES
+
+  0  success
+  1  generic error
+  2  invalid usage / invalid HAR
+  3  no entries matched (extract subcommand)
+  4  I/O error
+
+=head1 AUTHOR
+
+Saul Blanco Tejero <@elGolpista>
+
+=head1 LICENSE
+
+GPL-3.0-only.
+
+=cut
diff --git a/cpanfile b/cpanfile
new file mode 100644
index 0000000..a9195b0
--- /dev/null
+++ b/cpanfile
@@ -0,0 +1,37 @@
+requires 'perl', '5.036';
+
+# Core modules (always available with modern perl)
+requires 'JSON::PP';
+requires 'MIME::Base64';
+requires 'Encode';
+requires 'File::Path';
+requires 'File::Spec';
+requires 'File::Basename';
+requires 'File::Temp';
+requires 'Getopt::Long';
+requires 'Digest::SHA';
+requires 'Digest::MD5';
+requires 'Time::HiRes';
+requires 'Archive::Tar';
+requires 'IO::Compress::Gzip';
+requires 'IO::Uncompress::Gunzip';
+requires 'IO::Uncompress::Inflate';
+requires 'IO::Uncompress::RawInflate';
+
+# Optional (features degrade gracefully if missing)
+recommends 'JSON::XS';             # faster JSON
+recommends 'IO::Compress::Brotli'; # brotli decoding
+recommends 'Archive::Zip';         # --output-format zip
+recommends 'DBD::SQLite';          # --output-format sqlite
+recommends 'File::LibMagic';       # better MIME detection
+recommends 'URI';                  # better URL parsing
+
+on 'test' => sub {
+    requires 'Test::More', '0.98';
+};
+
+on 'develop' => sub {
+    recommends 'Perl::Critic';
+    recommends 'Perl::Tidy';
+    recommends 'Devel::Cover';
+};
diff --git a/docs/COOKBOOK.md b/docs/COOKBOOK.md
new file mode 100644
index 0000000..299c7f4
--- /dev/null
+++ b/docs/COOKBOOK.md
@@ -0,0 +1,171 @@
+# `harx` Cookbook — recipes for real tasks
+
+## 1. Download all images from a captured page
+
+```bash
+harx extract -i capture.har -o ./images \
+  --mime-regex '^image/' --status 2xx --domain '*.example.com'
+```
+
+Files are bucketed by MIME under `./images/image/png/`, `./images/image/svg+xml/`, etc.
+
+## 2. Mirror a site as a navigable directory tree
+
+```bash
+harx extract -i capture.har -o ./mirror --mirror --status 2xx
+```
+
+Every resource lands under `./mirror/<host>/<path>/`, ready for `python -m http.server`.
+
+## 3. Generate a reproducible bash script that replays the login
+
+```bash
+harx extract -i capture.har -o /dev/null --no-bodies --domain auth.example.com
+harx convert curl --combined -i capture.har -o replay.sh \
+  --domain auth.example.com
+chmod +x replay.sh
+./replay.sh
+```
+
+## 4. Find leaked tokens before sharing a HAR with vendor support
+
+```bash
+harx audit secrets capture.har
+# → lists JWT, GitHub PAT, Stripe keys, AWS keys, PEM, password-in-URL, etc.
+
+# When happy, mask everything sensitive:
+harx sanitize -i capture.har -o redacted.har --redact-bodies
+harx audit secrets redacted.har   # should be clean
+```
+
+## 5. Audit a site's response headers for hardening gaps
+
+```bash
+harx audit headers capture.har
+#
+# [ 33/100] www.example.com  (12 response(s) analysed)
+#   [HIGH]   missing Content-Security-Policy on 12 response(s)
+#   [HIGH]   missing Strict-Transport-Security on 12 response(s)
+#   [MEDIUM] missing X-Content-Type-Options: nosniff on 12 response(s)
+```
+
+## 6. Build a Postman collection from a real session
+
+```bash
+harx convert postman -i capture.har -o api.postman_collection.json \
+  --name "Acme API (live capture)" --domain api.example.com
+# Import into Postman → ready to explore.
+```
+
+## 7. Compare performance before and after a fix
+
+```bash
+harx report stats before.har > before.txt
+harx report stats after.har  > after.txt
+diff before.txt after.txt
+```
+
+Or with the diff subcommand:
+
+```bash
+harx diff before.har after.har
+```
+
+## 8. Find every GraphQL operation a site uses
+
+```bash
+harx graphql -i capture.har --format files -o ./gql
+ls ./gql/query ./gql/mutation
+# .graphql + .vars.json per operation
+```
+
+## 9. Inventory the API surface of an app
+
+```bash
+harx endpoints -i capture.har --format openapi-lite -o api.json
+# → an OpenAPI 3.0 skeleton with normalised paths (e.g. /v1/users/:id).
+```
+
+## 10. Detect mixed content during HTTPS migration
+
+```bash
+harx audit mixed-content capture.har
+```
+
+## 11. Catch dangerous CORS configurations
+
+```bash
+harx audit cors capture.har
+# Flags wildcard origins with credentials, reflected origins, etc.
+```
+
+## 12. Generate an interactive HTML report for triage
+
+```bash
+harx report html -i capture.har -o report.html --title "Bug #1234"
+xdg-open report.html
+```
+
+The HTML is fully self-contained — embedded entries, embedded image previews,
+JS filtering, no external assets.
+
+## 13. Reuse a captured session with curl
+
+```bash
+harx extract -i capture.har --cookies-jar cookies.txt -o /tmp/discard --no-bodies
+curl -b cookies.txt -c cookies.txt https://example.com/api/me
+```
+
+## 14. Slice a HAR to a 30-second window
+
+```bash
+harx slice -i capture.har -o login.har \
+  --start '2025-05-05T10:00:00Z' --end '+30s'
+```
+
+## 15. Drop duplicate requests (e.g. polling loops)
+
+```bash
+harx dedupe -i noisy.har -o clean.har
+```
+
+## 16. Extract WebSocket message timeline
+
+```bash
+harx websocket -i capture.har -o frames.jsonl
+jq -r '"\(.time)\t\(.type)\t\(.data[0:80])"' frames.jsonl
+```
+
+## 17. Find the largest/slowest resources
+
+```bash
+harx report stats capture.har | head -50
+# Top 10 largest:
+#   3.2 M   https://example.com/main.js
+#   1.1 M   https://example.com/hero.png
+# Top 10 slowest:
+#   5230 ms https://api/slow-endpoint
+```
+
+## 18. Detect render-blocking JS / poor caching of static assets
+
+```bash
+harx report perf capture.har
+# render-blocking-script: sync
+
+
+EOH
+    return $html;
+}
+
+sub _esc ($s) {
+    $s //= '';
+    $s =~ s/&/&amp;/g; $s =~ s/</&lt;/g; $s =~ s/>/&gt;/g; $s =~ s/"/&quot;/g;
+    return $s;
+}
+
+1;
diff --git a/lib/HAR/Extractor/Report/Perf.pm b/lib/HAR/Extractor/Report/Perf.pm
new file mode 100644
index 0000000..09e5232
--- /dev/null
+++ b/lib/HAR/Extractor/Report/Perf.pm
@@ -0,0 +1,134 @@
+package HAR::Extractor::Report::Perf;
+use v5.36;
+use strict;
+use warnings;
+use HAR::Extractor::Entry;
+
+# Performance anti-pattern detection. Returns array of findings.
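+# Each finding is a plain hashref, e.g.:
+#   { entry_index => 7, severity => 'medium', issue => 'slow-request',
+#     detail => '2450 ms', url => 'https://example.com/api' }
+# (values illustrative). Severity here is one of: low, medium, high.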
+sub audit ($har) {
+    my @findings;
+    my $idx = 0;
+    for my $raw (@{ $har->{log}{entries} // [] }) {
+        my $e = HAR::Extractor::Entry->new($raw, $idx++);
+
+        # Compressible text without compression
+        my $mime = $e->mime;
+        if ($mime =~ m{^text/|^application/(json|xml|javascript|x-javascript|ld\+json|graphql|xhtml\+xml)}) {
+            my $ce   = $e->header('response', 'content-encoding');
+            my $size = $e->size;
+            if (!defined $ce && $size > 1024) {
+                push @findings, {
+                    entry_index => $e->index,
+                    severity    => 'medium',
+                    issue       => 'compressible-not-compressed',
+                    detail      => "$mime, $size bytes uncompressed",
+                    url         => $e->url,
+                };
+            }
+        }
+
+        # Cache headers absent on static assets
+        if ($mime =~ m{^image/|font/|^text/css|^application/javascript}) {
+            my $cc  = $e->header('response', 'cache-control') // '';
+            my $exp = $e->header('response', 'expires') // '';
+            if ($cc !~ /max-age|public|immutable/i && !$exp) {
+                push @findings, {
+                    entry_index => $e->index,
+                    severity    => 'low',
+                    issue       => 'static-no-cache',
+                    detail      => 'no Cache-Control or Expires header',
+                    url         => $e->url,
+                };
+            }
+            if ($cc =~ /\bno-store\b|\bno-cache\b/i) {
+                push @findings, {
+                    entry_index => $e->index,
+                    severity    => 'medium',
+                    issue       => 'static-no-store',
+                    detail      => "Cache-Control: $cc on static asset",
+                    url         => $e->url,
+                };
+            }
+        }
+
+        # Long redirect chains
+        if ($e->status >= 300 && $e->status < 400) {
+            my $loc = $e->header('response', 'location');
+            push @findings, {
+                entry_index => $e->index,
+                severity    => 'low',
+                issue       => 'redirect',
+                detail      => "redirect $e->{raw}{response}{status} -> " . ($loc // '?'),
+                url         => $e->url,
+            } if defined $loc;
+        }
+
+        # Slow request (>2s)
+        if ($e->time_ms > 2000) {
+            push @findings, {
+                entry_index => $e->index,
+                severity    => $e->time_ms > 5000 ? 'high' : 'medium',
+                issue       => 'slow-request',
+                detail      => sprintf('%.0f ms', $e->time_ms),
+                url         => $e->url,
+            };
+        }
+
+        # Large response (>1 MB)
+        my $size = $e->size; $size = 0 if $size < 0;
+        if ($size > 1024 * 1024) {
+            push @findings, {
+                entry_index => $e->index,
+                severity    => $size > 5 * 1024 * 1024 ? 'high' : 'medium',
+                issue       => 'large-response',
+                detail      => sprintf('%.1f MB', $size / 1024 / 1024),
+                url         => $e->url,
+            };
+        }
+    }
+
+    # Render-blocking heuristic: synchronous JS/CSS in <head>.
+    # This requires parsing HTML; we'll do a simple regex.
+    require HAR::Extractor::Decoder;
+    my $i2 = 0;
+    for my $raw (@{ $har->{log}{entries} // [] }) {
+        my $e = HAR::Extractor::Entry->new($raw, $i2++);
+        next unless $e->mime eq 'text/html';
+        my $body = HAR::Extractor::Decoder::decode_entry($e->raw);
+        next unless $body && $body->{response} && $body->{response}{is_text};
+        my $html = $body->{response}{text} // '';
+        my $head = $html =~ /<head[^>]*>(.*?)<\/head>/is ? $1 : $html;
+        while ($head =~ /<script\b([^>]*)\bsrc=["']([^"']+)["']([^>]*)>/gi) {
+            my $attrs = "$1 $3";
+            next if $attrs =~ /\b(async|defer|module)\b/i;
+            push @findings, {
+                entry_index => $e->index,
+                severity    => 'medium',
+                issue       => 'render-blocking-script',
+                detail      => 'sync ';
+my $har = make_har(
+    make_entry(
+        url  => 'https://www.example.com/',
+        mime => 'text/html',
+        body => $html,
+    ),
+    # And a parallel HTTP request
+    make_entry(
+        url  => 'http://insecure.example.com/track.gif',
+        mime => 'image/gif',
+    ),
+);
+
+my $f = HAR::Extractor::Audit::MixedContent::audit($har);
+ok scalar(@$f) >= 2, 'detects mixed content (entry + html attribute)' or diag explain $f;
+
+my @kinds = map { $_->{kind} } @$f;
+ok grep({ $_ eq 'http_request' } @kinds), 'http_request kind';
+ok grep({ $_ eq 'html_attribute' } @kinds), 'html_attribute kind';
+
+# Pure HTTPS HAR should produce nothing
+my $clean = make_har(
+    make_entry(url => 'https://x.com/', mime => 'text/html', body => ''),
+);
+my $clean_f = HAR::Extractor::Audit::MixedContent::audit($clean);
+is scalar(@$clean_f), 0, 'no findings on clean HTTPS-only HAR';
+
+done_testing;
diff --git a/t/13-audit-cors.t b/t/13-audit-cors.t
new file mode 100644
index 0000000..184cc67
--- /dev/null
+++ b/t/13-audit-cors.t
@@ -0,0 +1,32 @@
+use strict;
+use warnings;
+use Test::More;
+use lib 't/lib';
+use TestHelpers qw(make_har make_entry);
+use HAR::Extractor::Audit::CORS;
+
+my $har = make_har(
+    make_entry(
+        url => 'https://api.example.com/danger',
+        response_headers => [
+            { name => 'Access-Control-Allow-Origin',      value => '*' },
+            { name => 'Access-Control-Allow-Credentials', value => 'true' },
+        ],
+    ),
+    make_entry(
+        url => 'https://api.example.com/safe',
+        response_headers => [
+            { name => 'Access-Control-Allow-Origin',      value => 'https://app.example.com' },
+            { name => 'Access-Control-Allow-Credentials', value => 'true' },
+        ],
+        request_headers => [
+            { name => 'Origin', value => 'https://attacker.com' },
+        ],
+    ),
+);
+
+my $f = HAR::Extractor::Audit::CORS::audit($har);
+my @critical = grep { $_->{severity} eq 'critical' } @$f;
+ok scalar(@critical) >= 1, 'wildcard + creds is critical' or diag explain $f;
+
+done_testing;
diff --git a/t/14-audit-pii.t b/t/14-audit-pii.t
new file mode 100644
index 0000000..1e10cd8
--- /dev/null
+++ b/t/14-audit-pii.t
@@ -0,0 +1,33 @@
+use strict;
+use warnings;
+use Test::More;
+use lib 't/lib';
+use TestHelpers qw(make_har make_entry);
+use HAR::Extractor::Audit::PII;
+
+my $har = make_har(
+    make_entry(
+        url  => 'https://example.com/profile',
+        body => 'email: alice@example.com phone: +34 612345678 IP: 10.0.0.5 cc: 4111 1111 1111 1111',
+        mime => 'text/plain',
+    ),
+);
+
+my $f = HAR::Extractor::Audit::PII::audit($har, locale => 'es');
+my %ids = map { $_->{pattern_id} => 1 } @$f;
+ok $ids{email},    'email detected';
+ok $ids{phone_es}, 'phone (es) detected';
+ok $ids{ipv4},     'ipv4 detected';
+ok $ids{cc_pan},   'credit card with Luhn ok detected';
+
+# Invalid Luhn should NOT be flagged
+my $clean = make_har(make_entry(body => '4111 1111 1111 1112'));
+my $f2 = HAR::Extractor::Audit::PII::audit($clean);
+my @cc = grep { $_->{pattern_id} eq 'cc_pan' } @$f2;
+is scalar(@cc), 0, 'invalid Luhn rejected';
+
+# Snippet is redacted
+my ($email_finding) = grep { $_->{pattern_id} eq 'email' } @$f;
+unlike $email_finding->{snippet}, qr/alice\@example\.com/, 'email is redacted in output';
+
+done_testing;
diff --git a/t/15-sanitize.t b/t/15-sanitize.t
new file mode 100644
index 0000000..8b4f473
--- /dev/null
+++ b/t/15-sanitize.t
@@
-0,0 +1,49 @@ +use strict; +use warnings; +use Test::More; +use lib 't/lib'; +use TestHelpers qw(make_har make_entry); +use HAR::Extractor::Sanitize; +use HAR::Extractor::Audit::Secrets; + +my $har = make_har( + make_entry( + url => 'https://example.com/api?token=topsecret123&public=ok', + request_headers => [ + { name => 'Authorization', value => 'Bearer secret123' }, + { name => 'X-Trace-Id', value => 'safe-value' }, + ], + cookies_response => [{ name => 'sid', value => 'abc' }], + response_headers => [ + { name => 'Set-Cookie', value => 'sid=abc; HttpOnly' }, + { name => 'Content-Type', value => 'application/json' }, + ], + body => '{"github":"' . 'gh' . 'p_' . ('A' x 36) . '"}', + mime => 'application/json', + ), +); + +my $report = HAR::Extractor::Sanitize::sanitize($har, redact_bodies => 1); +ok $report->{headers} >= 2, 'headers redacted (Authorization + Cookie if present)'; +ok $report->{cookies} >= 1, 'cookies redacted'; +ok $report->{urls} >= 1, 'URL query params redacted'; +ok $report->{body_matches} >= 1, 'body secret redacted'; + +# Verify redactions in place +my $req_hdrs = $har->{log}{entries}[0]{request}{headers}; +my ($auth) = grep { $_->{name} eq 'Authorization' } @$req_hdrs; +is $auth->{value}, '[REDACTED]', 'authorization header replaced'; + +# X-Trace-Id was NOT redacted +my ($trace) = grep { $_->{name} eq 'X-Trace-Id' } @$req_hdrs; +is $trace->{value}, 'safe-value', 'safe header preserved'; + +# URL token redacted but public remains +like $har->{log}{entries}[0]{request}{url}, qr/token=REDACTED/, 'token query redacted'; +like $har->{log}{entries}[0]{request}{url}, qr/public=ok/, 'public query preserved'; + +# After sanitize, secrets scan should be clean +my $f = HAR::Extractor::Audit::Secrets::scan($har); +is scalar(grep { $_->{pattern_id} eq 'github_pat' } @$f), 0, 'no github_pat after sanitize'; + +done_testing; diff --git a/t/16-convert-curl.t b/t/16-convert-curl.t new file mode 100644 index 0000000..d44656b --- /dev/null +++ b/t/16-convert-curl.t @@ -0,0 +1,58 @@ +use strict; +use warnings; +use Test::More; +use lib 't/lib'; +use TestHelpers qw(make_har make_entry); +use HAR::Extractor::Entry; +use HAR::Extractor::Convert::Curl; + +my $entry = HAR::Extractor::Entry->new(make_entry( + url => 'https://api.example.com/v1/items', + method => 'POST', + request_headers => [ + { name => 'Authorization', value => 'Bearer abc' }, + { name => 'Content-Type', value => 'application/json' }, + { name => 'Host', value => 'api.example.com' }, # should be skipped + { name => ':path', value => '/v1/items' }, # pseudo header skipped + ], + request_body => { mimeType => 'application/json', text => '{"x":1}' }, +), 0); + +my $cmd = HAR::Extractor::Convert::Curl::for_entry($entry); +like $cmd, qr/^curl/, 'starts with curl'; +like $cmd, qr/-X 'POST'/, 'method flag'; +like $cmd, qr/-H 'Authorization: Bearer abc'/, 'auth header included'; +like $cmd, qr/--data-raw '\{"x":1\}'/, 'request body as data-raw'; +unlike $cmd, qr/-H 'Host:/, 'Host header skipped'; +unlike $cmd, qr/:path/, 'pseudo header skipped'; +like $cmd, qr/'https:\/\/api\.example\.com\/v1\/items'/, 'final URL quoted'; + +# Single-quote escape +{ + my $e = HAR::Extractor::Entry->new(make_entry( + url => "https://x.com/?q=it's", + ), 0); + my $c = HAR::Extractor::Convert::Curl::for_entry($e); + ok $c =~ /'\\''/, 'single quote escaped via posix bash trick'; +} + +# Combined output is a script +{ + my $entries = [ HAR::Extractor::Entry->new(make_entry(url => 'https://x.com/a'), 0), + HAR::Extractor::Entry->new(make_entry(url => 
'https://x.com/b'), 1) ]; + my $script = HAR::Extractor::Convert::Curl::render_all($entries, combined => 1); + like $script, qr/^#!\/usr\/bin\/env bash/, 'shebang'; + like $script, qr/x\.com\/a/, 'first url'; + like $script, qr/x\.com\/b/, 'second url'; +} + +# Per-entry files +{ + my $entries = [ HAR::Extractor::Entry->new(make_entry(url => 'https://x.com/a'), 0) ]; + my $files = HAR::Extractor::Convert::Curl::render_all($entries); + is ref $files, 'ARRAY', 'returns arrayref of files'; + like $files->[0]{filename}, qr/00000-/, 'has zero-padded index'; + like $files->[0]{content}, qr/^#!.*bash.*curl/s, 'has shebang and curl'; +} + +done_testing; diff --git a/t/17-convert-postman.t b/t/17-convert-postman.t new file mode 100644 index 0000000..d4af043 --- /dev/null +++ b/t/17-convert-postman.t @@ -0,0 +1,41 @@ +use strict; +use warnings; +use Test::More; +use lib 't/lib'; +use TestHelpers qw(make_har make_entry); +use HAR::Extractor::Entry; +use HAR::Extractor::Convert::Postman; +use JSON::PP; + +my @entries = ( + HAR::Extractor::Entry->new(make_entry( + url => 'https://api.example.com/users?page=1', + method => 'GET', + request_headers => [{ name => 'Accept', value => 'application/json' }], + ), 0), + HAR::Extractor::Entry->new(make_entry( + url => 'https://api.example.com/users', + method => 'POST', + request_body => { mimeType => 'application/json', text => '{"name":"alice"}' }, + ), 1), +); + +my $json = HAR::Extractor::Convert::Postman::render(\@entries, name => 'Test API'); +my $col = decode_json($json); + +is $col->{info}{name}, 'Test API', 'collection name'; +like $col->{info}{schema}, qr/v2\.1/, 'schema v2.1'; +is scalar @{ $col->{item} }, 2, 'two items'; + +my $get = $col->{item}[0]; +is $get->{request}{method}, 'GET', 'GET method'; +is_deeply $get->{request}{url}{host}, ['api','example','com'], 'host split'; +is_deeply $get->{request}{url}{path}, ['users'], 'path split'; +ok scalar @{ $get->{request}{url}{query} }, 'query parsed'; + +my $post = $col->{item}[1]; +is $post->{request}{method}, 'POST', 'POST method'; +is $post->{request}{body}{mode}, 'raw', 'raw body mode'; +like $post->{request}{body}{raw}, qr/alice/, 'body content present'; + +done_testing; diff --git a/t/18-convert-jars.t b/t/18-convert-jars.t new file mode 100644 index 0000000..a9e9f7f --- /dev/null +++ b/t/18-convert-jars.t @@ -0,0 +1,55 @@ +use strict; +use warnings; +use Test::More; +use lib 't/lib'; +use TestHelpers qw(make_har make_entry); +use File::Temp qw(tempfile); +use HAR::Extractor::Convert::CookiesJar; +use HAR::Extractor::Convert::Insomnia; +use HAR::Extractor::Convert::Wget; +use HAR::Extractor::Entry; +use JSON::PP; + +# CookiesJar: write Netscape format +{ + my $har = make_har( + make_entry( + url => 'https://example.com/login', + cookies_response => [ + { name => 'sid', value => 'abc', secure => JSON::PP::true, path => '/', domain => '.example.com', expires => '2030-01-01T00:00:00Z' }, + ], + ), + ); + my (undef, $path) = tempfile(SUFFIX => '.txt', UNLINK => 1); + HAR::Extractor::Convert::CookiesJar::write($har, $path); + open my $fh, '<', $path or die $!; + my $content = do { local $/; <$fh> }; + close $fh; + like $content, qr/Netscape HTTP Cookie File/, 'header line present'; + like $content, qr/\.example\.com\tTRUE\t\/\tTRUE\t\d+\tsid\tabc/, 'cookie line'; +} + +# Insomnia +{ + my $entries = [ HAR::Extractor::Entry->new(make_entry(url => 'https://x.com/a', method => 'GET'), 0) ]; + my $json = HAR::Extractor::Convert::Insomnia::render($entries, name => 'Test'); + my $doc = decode_json($json); + 
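+    # Insomnia v4 exports carry a top-level __export_format of 4 and a flat
+    # "resources" list of typed objects; requests are those with _type eq 'request'.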
is $doc->{__export_format}, 4, 'insomnia v4 format'; + is scalar(grep { $_->{_type} eq 'request' } @{ $doc->{resources} }), 1, 'one request resource'; +} + +# Wget +{ + my $e = HAR::Extractor::Entry->new(make_entry( + url => 'https://x.com/data', method => 'POST', + request_headers => [{ name => 'X-Foo', value => 'bar' }], + request_body => { mimeType => 'text/plain', text => 'payload' }, + ), 0); + my $cmd = HAR::Extractor::Convert::Wget::for_entry($e); + like $cmd, qr/^wget/, 'wget command'; + like $cmd, qr/--method='POST'/, 'method flag'; + like $cmd, qr/--header='X-Foo: bar'/, 'header flag'; + like $cmd, qr/--body-data='payload'/, 'body flag'; +} + +done_testing; diff --git a/t/19-report-stats.t b/t/19-report-stats.t new file mode 100644 index 0000000..8bf392d --- /dev/null +++ b/t/19-report-stats.t @@ -0,0 +1,28 @@ +use strict; +use warnings; +use Test::More; +use lib 't/lib'; +use TestHelpers qw(make_har make_entry); +use HAR::Extractor::Report::Stats; + +my $har = make_har( + make_entry(url => 'https://a.com/1', mime => 'application/json', status => 200, time_ms => 100), + make_entry(url => 'https://a.com/1', mime => 'application/json', status => 200, time_ms => 100), # dup + make_entry(url => 'https://a.com/2', mime => 'image/png', status => 200, time_ms => 50), + make_entry(url => 'https://b.com/x', mime => 'text/html', status => 404, time_ms => 5000), +); + +my $s = HAR::Extractor::Report::Stats::compute($har); +is $s->{total_entries}, 4, 'total entries'; +is $s->{unique_hosts}, 2, 'unique hosts'; +ok $s->{by_status}{200} == 3, '200 count'; +ok $s->{by_status}{404} == 1, '404 count'; + +my @hosts = map { $_->{host} } @{ $s->{by_host} }; +ok grep({ $_ eq 'a.com' } @hosts), 'a.com in by_host'; +ok scalar(@{ $s->{duplicates} }) >= 1, 'at least one duplicate detected'; + +# top_slow first entry should be 5000ms +is $s->{top_slow}[0]{time_ms}, 5000, 'slowest first'; + +done_testing; diff --git a/t/20-report-html.t b/t/20-report-html.t new file mode 100644 index 0000000..8cbe9ec --- /dev/null +++ b/t/20-report-html.t @@ -0,0 +1,21 @@ +use strict; +use warnings; +use Test::More; +use lib 't/lib'; +use TestHelpers qw(make_har make_entry); +use HAR::Extractor::Report::HTML; + +my $har = make_har( + make_entry(url => 'https://a.com/index.html', mime => 'text/html', body => '
<html><h1>hi</h1></html>
'),
+    make_entry(url => 'https://a.com/data.json', mime => 'application/json', body => '{"x":1}'),
+);
+
+my $html = HAR::Extractor::Report::HTML::render($har, title => 'My Report');
+
+like $html, qr/My Report<\/title>/, 'title';
+like $html, qr/<table id="t">/, 'table present';
+like $html, qr/const ENTRIES =/, 'JS data injected';
+like $html, qr/<h1>hi<\/h1>|&lt;h1&gt;hi&lt;\/h1&gt;/, 'preview text or escaped';
+like $html, qr/a\.com\/data\.json/, 'URL appears in JSON data';
+
+done_testing;
diff --git a/t/21-report-perf-waterfall.t b/t/21-report-perf-waterfall.t
new file mode 100644
index 0000000..d3252a5
--- /dev/null
+++ b/t/21-report-perf-waterfall.t
@@ -0,0 +1,51 @@
+use strict;
+use warnings;
+use Test::More;
+use lib 't/lib';
+use TestHelpers qw(make_har make_entry);
+use HAR::Extractor::Report::Perf;
+use HAR::Extractor::Report::Waterfall;
+
+# Perf: large uncompressed JSON, slow request, no cache on image
+my $har = make_har(
+    make_entry(
+        url  => 'https://example.com/data.json',
+        mime => 'application/json',
+        body => 'x' x 5000,
+    ),
+    make_entry(
+        url  => 'https://example.com/slow',
+        mime => 'text/html',
+        body => '<html></html>',
+        time_ms => 4000,
+    ),
+    make_entry(
+        url  => 'https://example.com/img.png',
+        mime => 'image/png',
+        body => 'x',
+    ),
+    make_entry(
+        url  => 'https://example.com/render-blocking',
+        mime => 'text/html',
+        body => '<html><head><script src="https://cdn/script.js"></script></head></html>',
+    ),
+);
+
+my $f = HAR::Extractor::Report::Perf::audit($har);
+my %issues = map { $_->{issue} => 1 } @$f;
+ok $issues{'compressible-not-compressed'}, 'uncompressed compressible detected';
+ok $issues{'slow-request'}, 'slow request detected';
+ok $issues{'static-no-cache'}, 'no cache on static asset';
+ok $issues{'render-blocking-script'}, 'render-blocking script detected';
+
+# Waterfall: rendering should not crash
+my $har_w = make_har(
+    make_entry(url => 'https://x.com/a', started => '2025-01-01T00:00:00.000Z', time_ms => 100),
+    make_entry(url => 'https://x.com/b', started => '2025-01-01T00:00:00.500Z', time_ms => 200),
+);
+my $out = HAR::Extractor::Report::Waterfall::render($har_w, width => 40);
+like $out, qr/waterfall:/, 'waterfall header';
+like $out, qr/x\.com\/a/, 'first url present';
+like $out, qr/x\.com\/b/, 'second url present';
+
+done_testing;
diff --git a/t/22-diff-merge-dedupe-slice.t b/t/22-diff-merge-dedupe-slice.t
new file mode 100644
index 0000000..52ac04a
--- /dev/null
+++ b/t/22-diff-merge-dedupe-slice.t
@@ -0,0 +1,65 @@
+use strict;
+use warnings;
+use Test::More;
+use lib 't/lib';
+use TestHelpers qw(make_har make_entry);
+use HAR::Extractor::Diff;
+use HAR::Extractor::Merge;
+use HAR::Extractor::Dedupe;
+use HAR::Extractor::Slice;
+
+# Diff
+{
+    my $a = make_har(
+        make_entry(url => 'https://x.com/a'),
+        make_entry(url => 'https://x.com/b', status => 200),
+    );
+    my $b = make_har(
+        make_entry(url => 'https://x.com/a'),
+        make_entry(url => 'https://x.com/b', status => 500),
+        make_entry(url => 'https://x.com/c'),
+    );
+    my $d = HAR::Extractor::Diff::diff($a, $b);
+    is scalar @{ $d->{only_in_a} }, 0, 'no only_in_a';
+    is scalar @{ $d->{only_in_b} }, 1, 'one only_in_b';
+    is $d->{only_in_b}[0]{url}, 'https://x.com/c', 'identifies new URL';
+    is scalar @{ $d->{changed} }, 1, 'one changed';
+    is $d->{changed}[0]{diffs}[0][0], 'status', 'change type is status';
+}
+
+# Merge
+{
+    my $a = make_har(make_entry(url => 'https://a/1', started => '2025-01-01T00:00:00.000Z'));
+    my $b = make_har(make_entry(url => 'https://a/2', started =>
'2025-01-01T00:00:01.000Z')); + my $m = HAR::Extractor::Merge::merge($a, $b); + is scalar @{ $m->{log}{entries} }, 2, 'merged 2 entries'; + is $m->{log}{entries}[0]{request}{url}, 'https://a/1', 'sorted by time'; +} + +# Dedupe +{ + my $h = make_har( + make_entry(url => 'https://x.com/a'), + make_entry(url => 'https://x.com/a'), + make_entry(url => 'https://x.com/b'), + ); + my $r = HAR::Extractor::Dedupe::dedupe($h); + is $r->{kept}, 2, 'dedupe keeps 2'; + is $r->{removed}, 1, 'dedupe removes 1'; +} + +# Slice +{ + my $h = make_har( + make_entry(url => 'https://x.com/a', started => '2025-01-01T00:00:00.000Z'), + make_entry(url => 'https://x.com/b', started => '2025-01-01T00:00:30.000Z'), + make_entry(url => 'https://x.com/c', started => '2025-01-01T00:01:00.000Z'), + ); + my $kept = HAR::Extractor::Slice::slice($h, + start => '2025-01-01T00:00:10.000Z', + end => '2025-01-01T00:00:45.000Z'); + is $kept, 1, 'one entry in window'; + is $h->{log}{entries}[0]{request}{url}, 'https://x.com/b', 'kept the middle one'; +} + +done_testing; diff --git a/t/23-special.t b/t/23-special.t new file mode 100644 index 0000000..f45c437 --- /dev/null +++ b/t/23-special.t @@ -0,0 +1,92 @@ +use v5.36; +use strict; +use warnings; +use Test::More; +use lib 't/lib'; +use TestHelpers qw(make_har make_entry); +use File::Temp qw(tempdir); +use HAR::Extractor::Special::GraphQL; +use HAR::Extractor::Special::WebSocket; +use HAR::Extractor::Special::Endpoints; + +# GraphQL: POST + GET +{ + my $har = make_har( + make_entry( + url => 'https://api/graphql', + method => 'POST', + request_body => { + mimeType => 'application/json', + text => '{"operationName":"GetUser","query":"query GetUser($id: ID!) { user(id: $id) { id name } }","variables":{"id":1}}', + }, + ), + make_entry( + url => 'https://api/graphql?query=' . 
_urlenc('mutation Hi { hi }'), + method => 'GET', + ), + make_entry( + url => 'https://api/graphql', + method => 'POST', + request_body => { + mimeType => 'application/json', + text => '[{"query":"{ a }"},{"query":"{ b }"}]', + }, + ), + ); + my $ops = HAR::Extractor::Special::GraphQL::extract($har); + is scalar @$ops, 4, 'extracted 4 operations (1 named + 1 mutation + 2 batch)'; + my %kinds; $kinds{$_->{kind}}++ for @$ops; + is $kinds{query}, 3, '3 queries'; + is $kinds{mutation}, 1, '1 mutation'; + + my ($named) = grep { $_->{operation} eq 'GetUser' } @$ops; + ok $named, 'named operation found'; + is $named->{variables}{id}, 1, 'variables parsed'; + + my $tmp = tempdir(CLEANUP => 1); + my $n = HAR::Extractor::Special::GraphQL::write_files($ops, $tmp); + is $n, 4, 'wrote 4 files'; + ok -d "$tmp/query", 'query/ dir'; + ok -d "$tmp/mutation", 'mutation/ dir'; + ok -f "$tmp/query/GetUser.graphql", 'GetUser.graphql exists'; + ok -f "$tmp/query/GetUser.graphql.vars.json", 'variables saved'; +} + +# WebSocket +{ + my $har = make_har( + make_entry( + url => 'wss://chat/ws', + ws_messages => [ + { type => 'send', time => 0.0, opcode => 1, data => 'hello' }, + { type => 'receive', time => 0.1, opcode => 1, data => 'world' }, + ], + ), + ); + my $f = HAR::Extractor::Special::WebSocket::extract($har); + is scalar @$f, 2, 'two frames'; + is $f->[0]{type}, 'send', 'send first'; + is $f->[1]{type}, 'receive', 'receive second'; +} + +# Endpoints with normalization +{ + my $har = make_har( + make_entry(url => 'https://api.x.com/v1/users/42'), + make_entry(url => 'https://api.x.com/v1/users/99', method => 'GET', status => 200), + make_entry(url => 'https://api.x.com/v1/users/abc-12345678-1234-1234-1234-1234567890ab'), + make_entry(url => 'https://api.x.com/v1/health', method => 'GET'), + ); + my $eps = HAR::Extractor::Special::Endpoints::extract($har); + is scalar @$eps, 3, '3 unique endpoints'; + my @templates = map { $_->{template} } @$eps; + ok grep({ $_ eq '/v1/users/:id' } @templates), 'numeric id normalized'; + ok grep({ $_ eq '/v1/health' } @templates), 'health passthrough'; +} + +sub _urlenc ($s) { + $s =~ s/([^A-Za-z0-9])/sprintf '%%%02X', ord $1/ge; + return $s; +} + +done_testing; diff --git a/t/24-cli-extract.t b/t/24-cli-extract.t new file mode 100644 index 0000000..15b3c95 --- /dev/null +++ b/t/24-cli-extract.t @@ -0,0 +1,76 @@ +use strict; +use warnings; +use Test::More; +use lib 't/lib'; +use TestHelpers qw(make_har make_entry write_har_to_file); +use File::Temp qw(tempdir); +use File::Spec; +use FindBin qw($Bin); +use JSON::PP; +use MIME::Base64 qw(encode_base64); + +my $bin = File::Spec->rel2abs("$Bin/../bin/harx"); +my $lib = File::Spec->rel2abs("$Bin/../lib"); + +# Build a HAR with mixed content +my $har = make_har( + make_entry(url => 'https://a.com/p.png', + body => encode_base64("\x89PNG\r\n\x1A\n" . 
('A' x 50), ''), + encoding => 'base64', mime => 'image/png'), + make_entry(url => 'https://a.com/data.json', mime => 'application/json', + body => '{"x":1}'), + make_entry(url => 'https://b.com/style.css', mime => 'text/css', + body => 'body{}'), +); +my $har_path = write_har_to_file($har); + +# CLI extract with filters: only domain a.com → 2 files +{ + my $tmp = tempdir(CLEANUP => 1); + my $rc = system("$^X -I$lib $bin extract -i $har_path -o $tmp --domain a.com >/dev/null 2>&1"); + is $rc, 0, 'extract succeeded'; + my @files = (); + sub _walk { my ($d) = @_; opendir my $dh, $d or return; while (my $f = readdir $dh) { next if $f =~ /^\./; my $p = "$d/$f"; if (-d $p) { _walk($p) } else { push @files, $p } } } + _walk($tmp); + my @body_files = grep { $_ !~ /manifest\.json$/ } @files; + is scalar(@body_files), 2, 'two body files for a.com'; + ok -f "$tmp/manifest.json", 'manifest.json present'; +} + +# CLI status filter +{ + my $har2 = make_har( + make_entry(url => 'https://x.com/a', status => 200, mime => 'text/plain', body => 'ok'), + make_entry(url => 'https://x.com/b', status => 404, mime => 'text/plain', body => 'no'), + ); + my $p2 = write_har_to_file($har2); + my $tmp = tempdir(CLEANUP => 1); + my $rc = system("$^X -I$lib $bin extract -i $p2 -o $tmp --status 4xx >/dev/null 2>&1"); + is $rc, 0, '--status filter ok'; + my @files; + sub _walk2 { my ($d) = @_; opendir my $dh, $d or return; while (my $f = readdir $dh) { next if $f =~ /^\./; my $p = "$d/$f"; if (-d $p) { _walk2($p) } else { push @files, $p } } } + _walk2($tmp); + is scalar(grep { !/manifest/ } @files), 1, 'only one file kept (4xx)'; +} + +# Extract with no matches → exit 3 +{ + my $tmp = tempdir(CLEANUP => 1); + my $rc = system("$^X -I$lib $bin extract -i $har_path -o $tmp --domain doesnotexist >/dev/null 2>&1"); + is $rc >> 8, 3, 'no matches yields rc=3'; +} + +# CLI validate: ok and bad +{ + my $rc = system("$^X -I$lib $bin validate $har_path >/dev/null 2>&1"); + is $rc, 0, 'validate ok'; + + my $tmp = tempdir(CLEANUP => 1); + my $bad = "$tmp/bad.har"; + open my $fh, '>', $bad or die $!; + print $fh '{not json'; close $fh; + $rc = system("$^X -I$lib $bin validate $bad >/dev/null 2>&1"); + isnt $rc, 0, 'validate fails on bad file'; +} + +done_testing; diff --git a/t/25-cli-other.t b/t/25-cli-other.t new file mode 100644 index 0000000..72d7274 --- /dev/null +++ b/t/25-cli-other.t @@ -0,0 +1,109 @@ +use strict; +use warnings; +use Test::More; +use lib 't/lib'; +use TestHelpers qw(make_har make_entry write_har_to_file); +use File::Temp qw(tempdir); +use File::Spec; +use FindBin qw($Bin); +use JSON::PP; + +my $bin = File::Spec->rel2abs("$Bin/../bin/harx"); +my $lib = File::Spec->rel2abs("$Bin/../lib"); + +my $har = make_har( + make_entry(url => 'https://api/users/1', method => 'GET', status => 200, mime => 'application/json', body => '{"name":"a"}'), + make_entry(url => 'https://api/users/2', method => 'GET', status => 200, mime => 'application/json', body => '{"name":"b"}'), + make_entry(url => 'https://api/users', method => 'POST', status => 201, mime => 'application/json', body => '{"id":3}'), +); +my $har_path = write_har_to_file($har); + +# audit secrets (no secrets) +{ + my $out = qx{$^X -I$lib $bin audit secrets $har_path 2>/dev/null}; + like $out, qr/No secrets/, 'audit secrets clean'; +} + +# audit headers (text) +{ + my $rc = system("$^X -I$lib $bin audit headers $har_path >/dev/null 2>&1"); + is $rc, 0, 'audit headers ok'; +} + +# convert curl --combined +{ + my $tmp = tempdir(CLEANUP => 1); + my $out = 
"$tmp/replay.sh"; + my $rc = system("$^X -I$lib $bin convert curl --combined -i $har_path -o $out >/dev/null 2>&1"); + is $rc, 0, 'convert curl --combined ok'; + ok -f $out, 'script exists'; + open my $fh, '<', $out or die $!; + my $content = do { local $/; <$fh> }; + close $fh; + like $content, qr/^#!/, 'shebang'; + like $content, qr/curl/, 'curl invocation'; + # Verify bash syntax + my $rc2 = system("bash -n $out 2>/dev/null"); + is $rc2, 0, 'script is valid bash syntax'; +} + +# convert postman +{ + my $tmp = tempdir(CLEANUP => 1); + my $out = "$tmp/c.json"; + my $rc = system("$^X -I$lib $bin convert postman -i $har_path -o $out >/dev/null 2>&1"); + is $rc, 0, 'convert postman ok'; + ok -f $out, 'collection exists'; + open my $fh, '<:raw', $out or die $!; local $/; my $j = <$fh>; close $fh; + my $col = decode_json($j); + is scalar @{ $col->{item} }, 3, 'collection has 3 items'; +} + +# stats +{ + my $rc = system("$^X -I$lib $bin stats -i $har_path >/dev/null 2>&1"); + is $rc, 0, 'stats ok'; +} + +# report html +{ + my $tmp = tempdir(CLEANUP => 1); + my $out = "$tmp/r.html"; + my $rc = system("$^X -I$lib $bin report html -i $har_path -o $out >/dev/null 2>&1"); + is $rc, 0, 'report html ok'; + ok -f $out, 'html report exists'; +} + +# endpoints +{ + my $rc = system("$^X -I$lib $bin endpoints -i $har_path >/dev/null 2>&1"); + is $rc, 0, 'endpoints ok'; +} + +# sanitize +{ + my $tmp = tempdir(CLEANUP => 1); + my $out = "$tmp/s.har"; + my $rc = system("$^X -I$lib $bin sanitize -i $har_path -o $out >/dev/null 2>&1"); + is $rc, 0, 'sanitize ok'; + ok -f $out, 'sanitized HAR exists'; +} + +# diff +{ + my $har2 = make_har( + make_entry(url => 'https://api/users/1', method => 'GET', status => 200, mime => 'application/json'), + make_entry(url => 'https://api/users/9', method => 'GET', status => 200, mime => 'application/json'), + ); + my $p2 = write_har_to_file($har2); + my $rc = system("$^X -I$lib $bin diff $har_path $p2 >/dev/null 2>&1"); + is $rc >> 8, 1, 'diff returns 1 when there are differences'; +} + +# completions +{ + my $out = qx{$^X -I$lib $bin completions bash 2>/dev/null}; + like $out, qr/_harx_complete/, 'bash completions emitted'; +} + +done_testing; diff --git a/t/26-manifest.t b/t/26-manifest.t new file mode 100644 index 0000000..3bdef2e --- /dev/null +++ b/t/26-manifest.t @@ -0,0 +1,64 @@ +use strict; +use warnings; +use Test::More; +use lib 't/lib'; +use TestHelpers qw(make_entry); +use File::Temp qw(tempfile); +use JSON::PP; +use HAR::Extractor::Entry; +use HAR::Extractor::Manifest; + +my $entry = HAR::Extractor::Entry->new(make_entry( + url => 'https://x.com/a', method => 'POST', status => 201, + mime => 'application/json', time_ms => 250, + timings => { dns => 5, connect => 6, ssl => 7, wait => 50, receive => 8, send => 1 }, +), 0); + +my $rec = HAR::Extractor::Manifest::record($entry, + output_path => 'application/json/a.json', + sha256 => 'deadbeef', + body_size_decoded => 42, +); + +is $rec->{url}, 'https://x.com/a', 'url'; +is $rec->{method}, 'POST', 'method'; +is $rec->{status}, 201, 'status'; +is $rec->{sha256}, 'deadbeef', 'sha'; +is $rec->{timings}{wait}, 50, 'wait timing'; + +# JSON manifest +{ + my (undef, $jp) = tempfile(SUFFIX => '.json', UNLINK => 1); + HAR::Extractor::Manifest::write_json([ $rec ], $jp); + open my $fh, '<:raw', $jp or die $!; + local $/; + my $j = <$fh>; + close $fh; + my $arr = decode_json($j); + is scalar @$arr, 1, 'one record'; +} + +# CSV manifest +{ + my (undef, $cp) = tempfile(SUFFIX => '.csv', UNLINK => 1); + 
HAR::Extractor::Manifest::write_csv([ $rec ], $cp); + open my $cfh, '<:raw', $cp or die $!; + my $hdr = <$cfh>; + my $row = <$cfh>; + close $cfh; + like $hdr, qr/index,method/, 'csv header'; + like $row, qr/0,POST,201/, 'csv row'; +} + +# JSONL +{ + my (undef, $lp) = tempfile(SUFFIX => '.jsonl', UNLINK => 1); + HAR::Extractor::Manifest::write_jsonl([ $rec, $rec ], $lp); + open my $lfh, '<:raw', $lp or die $!; + my @lines = <$lfh>; + close $lfh; + is scalar(@lines), 2, 'jsonl has two lines'; + ok decode_json($lines[0]), 'each line valid JSON'; +} + +done_testing; diff --git a/t/27-decoder-brotli.t b/t/27-decoder-brotli.t new file mode 100644 index 0000000..e5bdcf5 --- /dev/null +++ b/t/27-decoder-brotli.t @@ -0,0 +1,38 @@ +use strict; +use warnings; +use Test::More; +use lib 't/lib'; +use HAR::Extractor::Decoder qw(decode_body have_brotli); + +my $has_perl = have_brotli(); +my $has_cli = -x '/usr/bin/brotli' || -x '/usr/local/bin/brotli'; + +if (!$has_perl && !$has_cli) { + plan skip_all => 'no brotli (Perl module nor CLI) available'; +} + +# Build a brotli-compressed payload using whichever is available. +my $payload = "compressed brotli content"; +my $compressed; + +if ($has_perl) { + require IO::Compress::Brotli; + $compressed = IO::Compress::Brotli::bro($payload); +} else { + require File::Temp; + my $f = File::Temp->new(SUFFIX => '.txt'); + binmode $f; print $f $payload; close $f; + $compressed = `brotli -c "$f"`; + plan skip_all => 'brotli compress failed' if $? != 0 || !length $compressed; +} + +require MIME::Base64; +my $b64 = MIME::Base64::encode_base64($compressed, ''); +my $r = decode_body({ + text => $b64, + encoding => 'base64', + mimeType => 'text/plain', +}, content_encoding => 'br'); +is $r->{text}, $payload, 'brotli decoded'; + +done_testing; diff --git a/t/lib/TestHelpers.pm b/t/lib/TestHelpers.pm new file mode 100644 index 0000000..d9d7334 --- /dev/null +++ b/t/lib/TestHelpers.pm @@ -0,0 +1,105 @@ +package TestHelpers; +use v5.36; +use strict; +use warnings; +use Exporter 'import'; +use JSON::PP; +use MIME::Base64 qw(encode_base64); +use File::Temp qw(tempdir tempfile); +use IO::Compress::Gzip qw(gzip); + +our @EXPORT_OK = qw( + make_har make_entry write_har_to_file + tmpdir gz_b64 +); + +# Build a minimal valid HAR with the given entries. +sub make_har (@entries) { + return { + log => { + version => '1.2', + creator => { name => 'test', version => '1.0' }, + pages => [], + entries => [ @entries ], + }, + }; +} + +# Build a single entry. Opts: +# url, method, status, mime, body (text or bytes), encoding ('base64'|undef), +# request_headers, response_headers, content_encoding, started, time_ms, +# request_body (hash with text/mimeType), cookies_response. +sub make_entry (%o) { + my $headers_in = $o{request_headers} // []; + my $headers_out = $o{response_headers} // []; + if ($o{content_encoding}) { + push @$headers_out, { name => 'Content-Encoding', value => $o{content_encoding} }; + } + if ($o{mime}) { + push @$headers_out, { name => 'Content-Type', value => $o{mime} } + unless grep { lc($_->{name}) eq 'content-type' } @$headers_out; + } + return { + startedDateTime => $o{started} // '2025-01-01T00:00:00.000Z', + time => $o{time_ms} // 0, + request => { + method => $o{method} // 'GET', + url => $o{url} // 'https://example.com/', + httpVersion => $o{http_version} // 'HTTP/1.1', + headers => $headers_in, + queryString => $o{query_string} // [], + cookies => $o{request_cookies} // [], + (defined $o{request_body} ? 
(postData => $o{request_body}) : ()), + headersSize => -1, + bodySize => -1, + }, + response => { + status => $o{status} // 200, + statusText => $o{status_text} // 'OK', + httpVersion => $o{http_version} // 'HTTP/1.1', + headers => $headers_out, + cookies => $o{cookies_response} // [], + content => { + size => length($o{body} // ''), + mimeType => $o{mime} // 'text/plain', + (defined $o{encoding} ? (encoding => $o{encoding}) : ()), + (defined $o{body} ? (text => $o{body}) : ()), + }, + redirectURL => $o{redirect_url} // '', + headersSize => -1, + bodySize => -1, + }, + cache => {}, + timings => $o{timings} // { send => 0, wait => 0, receive => 0 }, + ($o{from_cache} ? (_fromCache => JSON::PP::true) : ()), + ($o{ws_messages} ? (_webSocketMessages => $o{ws_messages}) : ()), + }; +} + +sub write_har_to_file ($har, $path = undef) { + my $j = JSON::PP->new->utf8->canonical->pretty; + my ($fh, $filename); + if ($path) { + open $fh, '>:raw', $path or die "open $path: $!"; + $filename = $path; + } else { + ($fh, $filename) = tempfile(SUFFIX => '.har', UNLINK => 1); + binmode $fh; + } + print $fh $j->encode($har); + close $fh; + return $filename; +} + +sub tmpdir { + return tempdir(CLEANUP => 1); +} + +# Convenience: gzip+b64 a string for HAR-encoded bodies. +sub gz_b64 ($text) { + my $gz; + gzip(\$text => \$gz) or die "gzip: $IO::Compress::Gzip::GzipError"; + return encode_base64($gz, ''); +} + +1;
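As a closing illustration of how these helpers compose, here is a minimal sketch of an additional decoder test (the file name is hypothetical; it assumes `decode_body` accepts `content_encoding => 'gzip'` the same way t/27 above passes `'br'`):

```perl
# t/28-decoder-gzip.t (hypothetical)
use strict;
use warnings;
use Test::More;
use lib 't/lib';
use TestHelpers qw(gz_b64);
use HAR::Extractor::Decoder qw(decode_body);

# Round-trip: gzip+base64 a JSON body the way a HAR stores it, then decode it back.
my $b64 = gz_b64('{"ok":true}');
my $r   = decode_body(
    { text => $b64, encoding => 'base64', mimeType => 'application/json' },
    content_encoding => 'gzip',
);
is $r->{text}, '{"ok":true}', 'gzip-encoded base64 body decoded back to original JSON';

done_testing;
```

Unlike the brotli case, this needs no skip guard: gzip support comes from core IO::Uncompress::Gunzip, which the cpanfile already requires.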