Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
98 changes: 74 additions & 24 deletions internal/site/league_parser.go
Original file line number Diff line number Diff line change
Expand Up @@ -161,43 +161,93 @@ func parseFixturesTable(table *goquery.Selection) []Fixture {
fixtures := make([]Fixture, 0, 16)

table.Find("tr").Each(func(_ int, row *goquery.Selection) {
matchLinks := row.Find("a[href*='mecz.php']")
if matchLinks.Length() != 1 {
fixture, ok := parseFixtureRow(row)
if !ok {
return
}

fixtures = append(fixtures, fixture)
})

return fixtures
}

// parseFixtureRow turns one table row into a Fixture.
//
// The second result is false, and the Fixture is the zero value, when the row
// has fewer than three td cells, when fixtureScoreCell cannot pick a score
// cell, when either neighbouring team name is empty or itself looks like a
// score, or when the score cell text is not accepted by isFixtureScoreText.
// MatchURL and MatchID are derived from whatever link fixtureScoreCell
// reports, which may be the empty string.
func parseFixtureRow(row *goquery.Selection) (Fixture, bool) {
	var none Fixture

	cells := row.Find("td")
	if cells.Length() < 3 {
		return none, false
	}

	scoreCell, scorePos, link, found := fixtureScoreCell(row, cells)
	if !found {
		return none, false
	}

	// Team names are searched outwards from the score cell: leftwards for the
	// home side, rightwards for the away side.
	homeName, _ := nearestTeamCellText(cells, scorePos-1, -1)
	awayName, awayPos := nearestTeamCellText(cells, scorePos+1, 1)
	switch {
	case homeName == "", awayName == "":
		return none, false
	case isScoreLikeText(homeName), isScoreLikeText(awayName):
		return none, false
	}

	scoreText := normalizeWhitespace(scoreCell.Text())
	if !isFixtureScoreText(scoreText) {
		return none, false
	}

	parsed := Fixture{
		Home:  homeName,
		Away:  awayName,
		Score: scoreText,
		// Everything after the away team cell is collected as free-form info.
		WhenInfo: joinNonEmptyCells(cells, awayPos+1),
		MatchURL: link,
		MatchID:  extractMatchID(link),
	}
	return parsed, true
}

func fixtureScoreCell(row *goquery.Selection, tds *goquery.Selection) (*goquery.Selection, int, string, bool) {
matchLinks := row.Find("a[href*='mecz.php']")
if matchLinks.Length() > 1 {
return nil, -1, "", false
}
if matchLinks.Length() == 1 {
scoreCell := matchLinks.First().Closest("td")
if scoreCell.Length() == 0 {
return
return nil, -1, "", false
}

tds := row.Find("td")
scoreIdx := scoreCell.Index()
home, _ := nearestTeamCellText(tds, scoreIdx-1, -1)
away, awayIdx := nearestTeamCellText(tds, scoreIdx+1, 1)
if home == "" || away == "" || isScoreLikeText(home) || isScoreLikeText(away) {
return
matchLink := strings.TrimSpace(matchLinks.First().AttrOr("href", ""))
if matchLink == "" {
return nil, -1, "", false
}

matchLink := strings.TrimSpace(scoreCell.Find("a[href*='mecz.php']").First().AttrOr("href", ""))
score := normalizeWhitespace(scoreCell.Text())
whenInfo := joinNonEmptyCells(tds, awayIdx+1)
return scoreCell, scoreCell.Index(), matchLink, true
}

if score == "" || matchLink == "" {
return
scoreIdx := -1
for idx := 0; idx < tds.Length(); idx++ {
if !isFixtureScoreText(tds.Eq(idx).Text()) {
continue
}
home, _ := nearestTeamCellText(tds, idx-1, -1)
away, _ := nearestTeamCellText(tds, idx+1, 1)
if home == "" || away == "" || isScoreLikeText(home) || isScoreLikeText(away) {
continue
}
if scoreIdx >= 0 {
return nil, -1, "", false
}
scoreIdx = idx
}
if scoreIdx < 0 {
return nil, -1, "", false
}

fixtures = append(fixtures, Fixture{
Home: home,
Away: away,
Score: score,
WhenInfo: whenInfo,
MatchURL: matchLink,
MatchID: extractMatchID(matchLink),
})
})
return tds.Eq(scoreIdx), scoreIdx, "", true
}

return fixtures
// isFixtureScoreText reports whether text, after whitespace normalization, is
// either the bare "-" placeholder or something isScoreLikeText accepts.
func isFixtureScoreText(text string) bool {
	switch candidate := normalizeWhitespace(text); candidate {
	case "-":
		return true
	default:
		return isScoreLikeText(candidate)
	}
}

func parseStandings(doc *goquery.Document) []StandingRow {
Expand Down
6 changes: 6 additions & 0 deletions internal/site/models.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,12 @@ type Competition struct {
LeagueKey string
}

// CompetitionMenu is a titled group of competitions.
// NOTE(review): presumably the result of parsing a selector/submenu page
// (III liga groups, regional associations, regional cups) — confirm against
// parseCompetitionMenu, which is not visible here.
type CompetitionMenu struct {
// Title is the menu heading.
Title string
// URL presumably identifies the page the menu was read from — TODO confirm.
URL string
// Items are the competitions listed in the menu.
Items []Competition
}

type Fixture struct {
Home string
Away string
Expand Down
95 changes: 95 additions & 0 deletions internal/site/parser_archive_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -83,17 +83,29 @@ func TestParseSeasonsAndCompetitionsFromArchiveFixtures(t *testing.T) {
ekstraklasaURL := c.Resolve("/liga/1/liga11233.html")
iLigaURL := c.Resolve("/liga/1/liga11234.html")
iiLigaURL := c.Resolve("/liga/1/liga11235.html")
iiiLigaSelectorURL := c.Resolve("/ligireg.php?poziom=4&id_sezon=97")
regionalneURL := c.Resolve("/ligireg.php?id_sezon=97")
regionalCupsURL := c.Resolve("/polcups.php?id_sezon=97")

ekstraklasaIdx := competitionIndexByURL(competitions, ekstraklasaURL)
iLigaIdx := competitionIndexByURL(competitions, iLigaURL)
iiLigaIdx := competitionIndexByURL(competitions, iiLigaURL)
iiiLigaSelectorIdx := competitionIndexByURL(competitions, iiiLigaSelectorURL)
regionalneIdx := competitionIndexByURL(competitions, regionalneURL)
regionalCupsIdx := competitionIndexByURL(competitions, regionalCupsURL)

if ekstraklasaIdx < 0 || iLigaIdx < 0 || iiLigaIdx < 0 {
t.Fatalf("missing expected league links in 2020/21 archive")
}
if !(ekstraklasaIdx < iLigaIdx && iLigaIdx < iiLigaIdx) {
t.Fatalf("competition order broken: Ekstraklasa=%d I liga=%d II liga=%d", ekstraklasaIdx, iLigaIdx, iiLigaIdx)
}
if iiiLigaSelectorIdx < 0 || regionalneIdx < 0 {
t.Fatalf("missing III liga or ligi regionalne links in 2020/21 archive")
}
if regionalCupsIdx < 0 {
t.Fatalf("missing regional cups link in 2020/21 archive")
}
}

for _, season := range seasons {
Expand All @@ -117,3 +129,86 @@ func TestParseSeasonsAndCompetitionsFromArchiveFixtures(t *testing.T) {
t.Fatalf("expected at least one decoded Polish diacritic in archive fixtures")
}
}

// TestParseCompetitionMenuForIIILigaSelector feeds parseCompetitionMenu a
// synthetic III liga selector page with two group links ("I" and "II") and
// expects the menu title to be the bold heading and the item names to be the
// heading combined with each group label.
func TestParseCompetitionMenuForIIILigaSelector(t *testing.T) {
	const (
		pageURL = "http://www.90minut.pl/ligireg.php?poziom=4&id_sezon=107"
		markup  = `<html><body><table class="main"><tr><td valign="top"><p align="center"><b>III liga 2025/26</b></p><table class="main"><tr><td width="100"></td><td><a href="/liga/1/liga14154.html" class="main">I</a></td></tr><tr><td width="100"></td><td><a href="/liga/1/liga14155.html" class="main">II</a></td></tr></table></td></tr></table></body></html>`
	)

	doc, err := decodeAndParse([]byte(markup), "text/html; charset=utf-8")
	if err != nil {
		t.Fatalf("parse synthetic HTML: %v", err)
	}

	menu := parseCompetitionMenu(doc, pageURL, NewClient())
	if menu == nil {
		t.Fatalf("expected III liga submenu")
	}
	if got := menu.Title; got != "III liga 2025/26" {
		t.Fatalf("unexpected menu title: %q", got)
	}

	wantNames := []string{"III liga 2025/26, gr. I", "III liga 2025/26, gr. II"}
	namesMatch := len(menu.Items) == len(wantNames)
	for i := 0; namesMatch && i < len(wantNames); i++ {
		namesMatch = menu.Items[i].Name == wantNames[i]
	}
	if !namesMatch {
		t.Fatalf("unexpected III liga submenu items: %+v", menu.Items)
	}
}

// TestParseCompetitionMenuForRegionalRoot parses a synthetic regional root
// page holding two association links and one "Dziś grają" link, and expects
// only the two associations to come back, in page order.
func TestParseCompetitionMenuForRegionalRoot(t *testing.T) {
	const (
		pageURL = "http://www.90minut.pl/ligireg.php?id_sezon=107"
		markup  = `<html><body><table class="main"><tr><td valign="top"><p align="center"><b>Ligi regionalne 2025/26</b></p><a href="/ligireg-16.html" class="main">Dolnośląski ZPN</a><a href="/mecze_okreg.php?id_okreg=16" class="main">Dziś grają</a><a href="/ligireg-1.html" class="main">Kujawsko-Pomorski ZPN</a></td></tr></table></body></html>`
	)

	doc, err := decodeAndParse([]byte(markup), "text/html; charset=utf-8")
	if err != nil {
		t.Fatalf("parse synthetic HTML: %v", err)
	}

	menu := parseCompetitionMenu(doc, pageURL, NewClient())
	if menu == nil {
		t.Fatalf("expected regional submenu")
	}

	wantNames := []string{"Dolnośląski ZPN", "Kujawsko-Pomorski ZPN"}
	namesMatch := len(menu.Items) == len(wantNames)
	for i := 0; namesMatch && i < len(wantNames); i++ {
		namesMatch = menu.Items[i].Name == wantNames[i]
	}
	if !namesMatch {
		t.Fatalf("unexpected regional submenu items: %+v", menu.Items)
	}
}

// TestParseCompetitionMenuForRegionalRootWithAssociationQueryLinks covers a
// regional root page whose association links use the query-string form
// (ligireg.php?id_okreg=...) and expects both links to become menu items.
func TestParseCompetitionMenuForRegionalRootWithAssociationQueryLinks(t *testing.T) {
	const (
		pageURL = "http://www.90minut.pl/ligireg.php?id_sezon=107"
		markup  = `<html><body><table class="main"><tr><td valign="top"><p align="center"><b>Ligi regionalne 2025/26</b></p><a href="/ligireg.php?id_okreg=16&id_sezon=107" class="main">Dolnośląski ZPN</a><a href="/ligireg.php?id_okreg=8&id_sezon=107" class="main">Lubuski ZPN</a></td></tr></table></body></html>`
	)

	doc, err := decodeAndParse([]byte(markup), "text/html; charset=utf-8")
	if err != nil {
		t.Fatalf("parse synthetic HTML: %v", err)
	}

	menu := parseCompetitionMenu(doc, pageURL, NewClient())
	if menu == nil {
		t.Fatalf("expected regional submenu")
	}
	if count := len(menu.Items); count != 2 {
		t.Fatalf("unexpected regional submenu items: %+v", menu.Items)
	}
}

// TestParseCompetitionMenuForRegionalAssociationPage parses a synthetic page
// for a single regional association that links two leagues and expects two
// menu items.
func TestParseCompetitionMenuForRegionalAssociationPage(t *testing.T) {
	const (
		pageURL = "http://www.90minut.pl/ligireg-16.html"
		markup  = `<html><body><table class="main"><tr><td valign="top"><p align="center"><b>Ligi regionalne 2025/26 - Dolnośląski ZPN</b></p><a href="/liga/1/liga14169.html" class="main">IV liga 2025/2026, grupa: dolnośląska</a><a href="/liga/1/liga14204.html" class="main">Klasa okręgowa 2025/2026, grupa: Jelenia Góra</a></td></tr></table></body></html>`
	)

	doc, err := decodeAndParse([]byte(markup), "text/html; charset=utf-8")
	if err != nil {
		t.Fatalf("parse synthetic HTML: %v", err)
	}

	menu := parseCompetitionMenu(doc, pageURL, NewClient())
	if menu == nil {
		t.Fatalf("expected regional association submenu")
	}
	if count := len(menu.Items); count != 2 {
		t.Fatalf("unexpected regional association item count: %d", count)
	}
}

// TestParseCompetitionMenuForRegionalCupsPage parses a synthetic cups page
// with four links and expects exactly two menu items; per the failure
// message, only the regional cup entries are meant to be kept.
func TestParseCompetitionMenuForRegionalCupsPage(t *testing.T) {
	const (
		pageURL = "http://www.90minut.pl/polcups.php?id_sezon=107"
		markup  = `<html><body><table class="main"><tr><td valign="top"><p align="center"><b>Puchary krajowe 2025/26</b></p><a href="/liga/1/liga14076.html" class="main">Puchar Polski</a><a href="/liga/1/liga14075.html" class="main">Superpuchar Polski</a><a href="/liga/1/liga14636.html" class="main">Puchar Polski 2025/2026, grupa: Lubuski ZPN</a><a href="/liga/1/liga14069.html" class="main">Puchar Polski 2025/2026, grupa: Lubuski ZPN - Gorzów Wielkopolski</a></td></tr></table></body></html>`
	)

	doc, err := decodeAndParse([]byte(markup), "text/html; charset=utf-8")
	if err != nil {
		t.Fatalf("parse synthetic HTML: %v", err)
	}

	menu := parseCompetitionMenu(doc, pageURL, NewClient())
	if menu == nil {
		t.Fatalf("expected regional cups submenu")
	}
	if count := len(menu.Items); count != 2 {
		t.Fatalf("expected regional cups only, got %+v", menu.Items)
	}
}
44 changes: 42 additions & 2 deletions internal/site/parser_league_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,15 +17,18 @@ func TestParseLeagueFixturesFromCorpus(t *testing.T) {
"league_14073": {firstRoundName: "Kolejka 1 - 19-20 lipca", firstRoundFixtures: 9},
}
leagues := fixturesByKind(m, "league")
if len(leagues) < 6 {
t.Fatalf("expected at least 6 league fixtures, got %d", len(leagues))
if len(leagues) < 7 {
t.Fatalf("expected at least 7 league fixtures, got %d", len(leagues))
}
if !containsFixtureName(leagues, "league_14072") {
t.Fatalf("expected league fixture for liga14072")
}
if !containsFixtureName(leagues, "league_14073") {
t.Fatalf("expected league fixture for liga14073")
}
if !containsFixtureName(leagues, "league_14141") {
t.Fatalf("expected league fixture for liga14141")
}

for _, fixture := range leagues {
fixture := fixture
Expand Down Expand Up @@ -61,6 +64,12 @@ func TestParseLeagueFixturesFromCorpus(t *testing.T) {
if isScoreLikeText(match.Home) || isScoreLikeText(match.Away) {
t.Fatalf("fixture side parsed as score token in %s: home=%q away=%q", fixture.Name, match.Home, match.Away)
}
if match.MatchURL == "" {
if match.MatchID != "" {
t.Fatalf("fixture without match url should keep empty match id in %s: %q", fixture.Name, match.MatchID)
}
continue
}
if !strings.Contains(match.MatchURL, "mecz.php") {
t.Fatalf("fixture match url is not a match link in %s: %q", fixture.Name, match.MatchURL)
}
Expand Down Expand Up @@ -102,6 +111,37 @@ func TestParseLeagueFixturesFromCorpus(t *testing.T) {
}
}

// TestParseLeagueFixturesWithoutMatchLinksFromCorpus parses the saved
// league_14141 page and checks every fixture that has no match URL: it must
// have an empty match ID and non-empty home, away and score. At least one
// such fixture must exist.
func TestParseLeagueFixturesWithoutMatchLinksFromCorpus(t *testing.T) {
	doc, _ := fixtureDoc(t, "fixtures/league_14141.html")

	page := parseLeaguePage(doc, "http://www.90minut.pl/liga/1/liga14141.html")
	if page == nil {
		t.Fatalf("expected league page")
	}
	if len(page.Rounds) == 0 {
		t.Fatalf("expected rounds for linkless fixture league")
	}

	var linkless int
	for _, rnd := range page.Rounds {
		for _, match := range rnd.Fixtures {
			if match.MatchURL != "" {
				// Linked fixtures are outside the scope of this test.
				continue
			}
			linkless++

			switch {
			case match.MatchID != "":
				t.Fatalf("expected empty match id for linkless fixture, got %q", match.MatchID)
			case match.Home == "" || match.Away == "" || match.Score == "":
				t.Fatalf("expected sides and score for linkless fixture, got %+v", match)
			}
		}
	}

	if linkless == 0 {
		t.Fatalf("expected at least one linkless fixture in liga14141")
	}
}

func assertFixturesSortedByDate(t *testing.T, fixtureName string, round Round) {
t.Helper()

Expand Down
54 changes: 54 additions & 0 deletions internal/site/parser_regression_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,60 @@ func TestParseFixturesTableSkipsRowsWithMultipleMatchLinks(t *testing.T) {
}
}

// TestParseFixturesTableFallsBackToPlainTextScoresWithoutMatchLinks builds a
// table with no match links: one unplayed fixture ("-"), one full-width note
// row, and one played fixture in bold. It expects the note row to be dropped
// and both fixtures to be parsed, the first with empty match URL and ID.
func TestParseFixturesTableFallsBackToPlainTextScoresWithoutMatchLinks(t *testing.T) {
	const markup = `
<table>
<tr><td>Slask Wroclaw</td><td>-</td><td>Pogon Tczew</td><td>16 maja, 12:00</td></tr>
<tr><td colspan="4">w pierwotnym terminie odwolany</td></tr>
<tr><td><b>Gornik Leczna</b></td><td><b>3-0</b></td><td><b>UKS SMS Lodz</b></td><td>25 marca, 16:00</td></tr>
</table>`

	doc, err := goquery.NewDocumentFromReader(strings.NewReader(markup))
	if err != nil {
		t.Fatalf("parse synthetic HTML: %v", err)
	}

	parsed := parseFixturesTable(doc.Find("table").First())
	if len(parsed) != 2 {
		t.Fatalf("expected 2 fixtures, got %d", len(parsed))
	}

	pending, played := parsed[0], parsed[1]
	switch {
	case pending.Home != "Slask Wroclaw",
		pending.Away != "Pogon Tczew",
		pending.Score != "-",
		pending.WhenInfo != "16 maja, 12:00":
		t.Fatalf("unexpected plain-text fixture: %+v", pending)
	case pending.MatchURL != "", pending.MatchID != "":
		t.Fatalf("expected empty match details for plain-text score fixture: %+v", pending)
	case played.Home != "Gornik Leczna",
		played.Away != "UKS SMS Lodz",
		played.Score != "3-0":
		t.Fatalf("unexpected second plain-text fixture: %+v", played)
	}
}

// TestParseLeaguePageHandlesSavedAmbiguousLinklessFixture parses the saved
// league_ambiguous_linkless page and expects one round with two fixtures.
// The first fixture must be Team A 1-0 Team B with no match link, and the
// extra "walkower 3-0" text must land in WhenInfo rather than in the score.
func TestParseLeaguePageHandlesSavedAmbiguousLinklessFixture(t *testing.T) {
	doc, _ := fixtureDoc(t, "fixtures/league_ambiguous_linkless.html")

	page := parseLeaguePage(doc, "http://www.90minut.pl/liga/1/liga99998.html")
	if page == nil {
		t.Fatalf("expected league page")
	}
	if n := len(page.Rounds); n != 1 {
		t.Fatalf("expected 1 round, got %d", n)
	}

	matches := page.Rounds[0].Fixtures
	if n := len(matches); n != 2 {
		t.Fatalf("expected 2 fixtures, got %d", n)
	}

	opener := matches[0]
	switch {
	case opener.Home != "Team A", opener.Away != "Team B", opener.Score != "1-0":
		t.Fatalf("unexpected first fixture: %+v", opener)
	case opener.WhenInfo != "walkower 3-0 24 lipca, 18:00":
		t.Fatalf("unexpected first fixture metadata: %+v", opener)
	case opener.MatchURL != "", opener.MatchID != "":
		t.Fatalf("expected linkless fixture, got %+v", opener)
	}
}

func TestRoundNameFromTableSkipsNavigationBlocks(t *testing.T) {
html := `
<table>
Expand Down
Loading