Skip to content

Commit

Permalink
Improve robustness of PGN parsing
Browse files Browse the repository at this point in the history
Attempts to ensure that any malformed or missing PGN comment produces an {unknown} comment in the processed version, without eating away at the move list. Three example PGNs are included in a new UnitTests folder for this.

The first two PGNs contain intentionally damaged results -- either by missing fields, having wrongly formatted fields, or having no comment at all. The third PGN is from self-play with Weiss, as an example of proper output.
  • Loading branch information
AndyGrant committed May 26, 2024
1 parent 809a21d commit 9f3fc11
Show file tree
Hide file tree
Showing 5 changed files with 685 additions and 10 deletions.
24 changes: 14 additions & 10 deletions Client/pgn_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,12 @@
import re
import sys

# For use externally

# Verbose comment: group 1 is either the literal 'book' or the entire
# "Score Depth/SelDepth Time Nodes" string (optionally signed score, with an
# optional 'M' prefix and optional decimal part).
REGEX_COMMENT_VERBOSE = r'(book|[+-]?M?\d+(?:\.\d+)? \d+/\d+ \d+ \d+)'

# Compact comment: matches the same text as the verbose pattern, but group 1
# holds only 'book' or the Score; the remaining fields are matched, not captured.
REGEX_COMMENT_COMPACT = r'(book|[+-]?M?\d+(?:\.\d+)?) \d+/\d+ \d+ \d+'

# One ply: an optional "N. " move number, the move token (group 1), a literal
# space, then an optional {...} comment whose inner text is group 2.
# NOTE(review): a move token followed directly by a newline or end-of-text
# (no space) does not match -- confirm inputs always carry the space.
REGEX_MOVE_AND_COMMENT = r'\s*(?:\d+\. )?([a-zA-Z0-9+=#-]+) (?:\s*\{\s*([^}]*)\s*\})?'

# Game result token: group 1 is one of 1-0, 0-1, 1/2-1/2 or *.
REGEX_GAME_RESULT = r'\s*(1-0|0-1|1/2-1/2|\*)'

def pgn_iterator(fname):
with open(fname) as pgn:
while True:
Expand Down Expand Up @@ -60,24 +66,22 @@ def pgn_strip_headers(headers, compact):

def pgn_strip_movelist(move_text, compact):
    """Reduce a PGN move list to '<Move> {<Comment>} ' pairs plus the result.

    Every move matched by REGEX_MOVE_AND_COMMENT is emitted with exactly one
    comment. If the move's original comment is missing, or does not contain
    text matching the expected comment format, the comment is 'unknown', so
    a malformed comment never removes a move from the output.

    Args:
        move_text: The move-list section of a single PGN game, as a string.
        compact: When truthy, keep only 'book' or the Score from each
            comment; otherwise keep 'book' or the full
            "Score Depth/SelDepth Time Nodes" text.

    Returns:
        The stripped move list followed by the game result token
        (1-0, 0-1, 1/2-1/2 or *).

    Raises:
        AttributeError: If move_text contains no game result token.
    """

    # May parse book, otherwise Score for Compact, Score Depth/SelDepth Time Nodes for Verbose
    comment_regex = re.compile(REGEX_COMMENT_COMPACT if compact else REGEX_COMMENT_VERBOSE)

    plies = []  # Add each: <Move> {<Comment>}
    for move, comment in re.findall(REGEX_MOVE_AND_COMMENT, move_text):
        match = comment_regex.search(comment)

        # group(1), not group(): in compact mode the full match still spans
        # Depth/SelDepth Time Nodes, and only the first group is the Score.
        # For verbose comments and 'book', group(1) equals the full match.
        plies.append('%s {%s} ' % (move, match.group(1) if match else 'unknown'))

    # PGNs expect trailing game result text
    return ''.join(plies) + re.search(REGEX_GAME_RESULT, move_text).group(1)

def strip_entire_pgn(file_name, scale_factor, compact):

Expand Down
84 changes: 84 additions & 0 deletions UnitTests/example1.pgn
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
[Event "?"]
[Site "?"]
[Date "2024.05.26"]
[Round "1"]
[White "Torch"]
[Black "Torch"]
[Result "1-0"]
[FEN "rnb1k1nr/pp1pbppp/8/q1p1p3/P4P2/R3PN2/1PPP2PP/1NBQKB1R w Kkq - 0 1"]
[GameDuration "00:00:04"]
[GameEndTime "2024-05-26T12:57:02.148 EDT"]
[GameStartTime "2024-05-26T12:56:57.601 EDT"]
[PlyCount "139"]
[SetUp "1"]
[TimeControl "inf"]

1. fxe5 {+4.23 12/14 75} d5 {-4.15 12/17 48 40000}
2. Nc3 {+3.70 12/17 40000} Be6 {-4.38 14/19 41 40000}
3. Bb5+ {+4.18 14 40 40000} Nc6 {-4.35 14/17 38 40000}
4. e4 {+4.10 17 40 40000} dxe4 {-3.98 14/21 39 40002}
5. Nxe4 {+3.96 13/15 40 40000} Nh6 {-4.20 14/23 36 40000}
6. O-O {+4.28 41 40000} Qb6 {-4.51 12/17 41 40000}
7. Nd6+ {12/17 41 40000} Bxd6 {-3.81 13/20 40 40000}
8. exd6 {436 13/19 41 40000} O-O-O {-3.89 13/18 39 40000}
9. d4 {+4.69 14/17 35 40000} Nxd4 {-3.97 12/18 40 40000}
10. Nxd4 {+4.41 13/18 38 40000} cxd4 {-4.09 13/17 36 40000}
11. h3 {+4 13/18 42 40000} Nf5 {-3.62 13/18 39 40000}
12. d7+ {+4.43 14/19 40 40001} Kb8 {-3.91 13/16 37 40000}
13. Qd3 {+4.12 12/18 36 40001} Ne3 {-3.91 14/22 34 40000}
14. Bxe3 {+5.07 15/23 32 40000} dxe3 {-5.90 15/23 33 40000}
15. a5 {+5.32 13/20 36 40000} Qc7 {-5.98 13/20 36 40000}
16. Qxe3 {+5.68 14/19 37 40000} a6 {-5.93 13/20 38 40002}
17. Ba4 {+6.14 14/20 37 40000} Ka8 {-6.14 13/19 38 40000}
18. Rf4 {+5.60 14/19 39 40000} Rhg8 {-5.94 12/18 37 40001}
19. Rf1 {+4.28 13/16 40 40001} g5 {-4.66 14/20 38 40000}
20. Qb6 {+3.63 14/24 37 40000} Qxb6+ {-3.52 16/21 34 40000}
21. axb6 {+4.35 16/25 33 40002} Bxd7 {-4.17 15/23 33 40000}
22. Rxf7 {+4.37 14/20 34 40000} Bxa4 {-3.97 15/21 30 40000}
23. Rxa4 {+4.24 15/22 32 40000} Rd6 {-3.85 14/18 33 40001}
24. Rb4 {+3.71 15/21 31 40000} h6 {-3.72 13/15 31 40000}
25. c4 {+2.92 14/17 32 40000} Rd4 {-2.68 15/17 34 40000}
26. Rf6 {+2.18 13/14 34 40000} Kb8 {-2.56 14/17 34 40002}
27. Rb3 {+3.34 15/20 32 40000} Rxc4 {-2.93 15/22 35 40000}
28. Rxh6 {+5.84 15/18 32 40001} Rc6 {-2.19 16/25 32 40000}
29. Rxc6 {+7.27 16/22 31 40002} bxc6 {-2.19 14/15 35 40002}
30. Rg3 {+7.90 15/19 26 40000} Kb7 {-7.07 13/20 35 40001}
31. h4 {+8.10 15/20 28 40001} g4 {-7.38 14/18 31 40000}
32. Kh2 {+8.30 13/14 30 40000} a5 {-7.79 13/14 32 40000}
33. Rb3 {+8.78 14/19 30 40001} c5 {-8.12 13/16 31 40002}
34. Rb5 {+9.45 14/5 32 40001} Kc6 {-8.91 13/18 32 40001}
35. Rxa5 {+9.40 14/20 34 40001} Kxb6 {-9.75 12/18 32 40000}
36. Ra1 {+9.77 15/20 33 40000} g3+ {-10.13 13/17 32 40000}
37. Kh3 {+10.06 15/19 34 40000} Kc6 {-10.70 13/15 35 40001}
38. Rc1 {+10.48 15/17 31 40000} Kd5 {-10.75 13/19 33 40000}
39. b3 {+11.46 14/13 29 40000} Re8 {-11.41 12/15 31 40000}
40. Rd1+ {+12.66 15/17 30 40000} Kc6 {-12.90 13/16 34 40001}
41. Rd3 {+13.29 13/14 30 40000} Re2 {-12.95 12/12 34 40000}
42. Kxg3 {+13.85 14/16 32 40000} Kb5 {-13.77 13/18 35 40001}
43. h5 {+14.23 14/13 31 40000} Kb4 {-14.39 12/17 31 40000}
44. Kh2 {+15.02 13/14 28 40000} Re8 {-14.73 12/13 31 40001}
45. Rh3 {+15.39 15/19 29 40000} c4 {-15.21 13/13 35 40001}
46. bxc4 {+15.80 14/19 30 40000} Kxc4 {-15.35 15/24 32 40000}
47. g4 {+16.17 13/13 26 40000} Kd5 {-15.95 11/13 31 40000}
48. Rf3 {+16.60 13/15 27 40000} Ke4 {-16.43 12/14 29 40000}
49. Kg3 {+16.95 12/15 26 40000} Re6 {-17.21 12/14 31 40000}
50. Rf5 {+18.09 14/15 22 40000} Rh6 {-18.72 11/14 30 40001}
51. Kh4 {+18.99 13/14 21 40000} Rh7 {-19.14 11/17 25 40001}
52. Rf1 {+19.70 12/13 22 40001} Rh8
53. g5 {+21.12 12/18 31 40000} Kd3
54. h6 {+21.65 13/17 28 40001} Re8
55. h7 {+22.79 12/16 29 40001} Ke2
56. Rf6 {+23.44 12/17 27 40000} Rb8
57. g6 {+25.41 13/19 35 40001} Rb1 {-24.83 13/16 34 40000}
58. Kg5 {+26.55 12/21 31 40007} Rh1 {-26.82 12/17 36 40000}
59. Rf4 {+27.35 13/18 32 40000} Rg1+ {-27.39 14/20 34 40000}
60. Rg4 {+28.67 13/21 33 40000} Rh1 {-29.83 14/17 34 40000}
61. Rh4 {+28.79 13/17 28 40000} Rg1+ {-33.19 13/17 30 40001}
62. Kh6 {+32.81 12/16 30 40000} Kf2 {-40.18 13/14 34 40000}
63. Rh5 {+36.19 12/18 33 40000} Ke2 {-36.84 12/16 32 40000}
64. g7 {+38.23 12/14 32 40000} Rxg7 {-39.91 13/18 33 40000}
65. Kxg7 {+40.41 14/20 26 40000} Kd3 {-44.12 14/19 27 40000}
66. Kg8 {+49.68 14/22 22 40000} Kc2 {-M54 16/25 18 40000}
67. Rh3 {+M9 23/10 20 40000} Kc1 {-M50 12/8 6 40001} 68. Rh2 {+M5 31/6 6 40000}
Kb1 {-M4 96/5 4 36036} 69. h8=Q {+M3 96/4 5 35602} Kc1 { 0/1 8 40000}
70. Qa1# {+M1 96/2 0 2516, White mates} 1-0
84 changes: 84 additions & 0 deletions UnitTests/example2.pgn
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
[Event "?"]
[Site "?"]
[Date "2024.05.26"]
[Round "1"]
[White "Torch"]
[Black "Torch"]
[Result "1-0"]
[FEN "rnb1k1nr/pp1pbppp/8/q1p1p3/P4P2/R3PN2/1PPP2PP/1NBQKB1R w Kkq - 0 1"]
[GameDuration "00:00:04"]
[GameEndTime "2024-05-26T12:57:30.145 EDT"]
[GameStartTime "2024-05-26T12:57:25.472 EDT"]
[PlyCount "139"]
[SetUp "1"]
[TimeControl "inf"]

1. fxe5 {+4.23 12/16 42 39485} d5 {-4.15 12/17 43 38473}
2. Nc3 {+3.83 12/17 40 34560} Be6 {-4.30 13/22 39 36002}
3. Bb5+ {+4.18 14/14 38 38924} Nc6 {-4.35 14/18 37 34854}
4. e4 {+4.18 13/17 42 28288} dxe4 {-3.98 14/21 39 32608}
5. Nxe4 {+3.96 13/12 40 29723} Nh6 {-4.20 14/23 36 35760}
6. O-O {+4.28 12/17 41 35389} Qb6 {-4.26 12/17 41 35735}
7. Nd6+ {+3.50 12/17 42 37697} Bxd6 {-3.81 13/18 41 38274}
8. exd6 {+4.36 13/19 42 38645} O-O-O {-3.89 13/18 40 39048}
9. d4 {+4.69 14/17 36 39006} Nxd4 {-3.97 12/17 40 34214}
10. Nxd4 {+4.41 13/18 39 36786} cxd4 {-4.09 13/17 39 34846}
11. h3 {+4.22 12/17 43 17157} Nf5 {-3.41 13/13 41 34143}
12. d7+ {+4.43 14/17 43 33165} Kb8 {-3.91 13/20 40 39120}
13. Qd3 {+4.04 12/18 39 37958} Ne3 {-3.91 14/22 38 39962}
14. Bxe3 {+5.07 15/23 35 39261} dxe3 {-5.90 15/20 37 37377}
15. a5 {+5.32 13/20 38 39290} Qc7 {-5.13 13/20 37 31953}
16. Qxe3 {+5.68 14/17 36 34560} a6 {-5.93 13/20 38 35655}
17. Ba4 {+6.14 14/20 39 33272} Ka8 {-6.14 13/17 40 39677}
18. Rf4 {+5.60 14/19 43 39488} Rhg8 {-5.46 12/18 42 33699}
19. Rf1 {+4.60 13/14 43 31456} g5 {-4.41 14/20 42 36233}
20. Qb6 {+3.63 14/26 41 36607} Qxb6+ {-3.52 16/19 38 37826}
21. axb6 {+4.35 16/26 39 39192} Bxd7 {-4.17 15/23 38 38646}
22. Rxf7 {+4.37 14/19 35 39040} Bxa4 {-3.97 15/21 32 37259}
23. Rxa4 {+4.24 15/21 34 38881} Rd6 {-3.85 14/18 35 36911}
24. Rb4 {+3.71 15/21 32 39209} h6 {-3.72 13/20 33 34852}
25. c4 {+3.69 14/13 35 26237} Rd4 {-2.68 15/14 37 33902}
26. Rf6 {+2.39 13/13 35 39084} Kb8 {-2.26 13/18 36 37450}
27. Rb3 {+3.34 15/20 34 37710} Rxc4 {-2.93 15/20 38 37205}
28. Rxh6 {+5.84 15/15 35 36844} Rc6 {-2.19 16/17 37 37014}
29. Rxc6 {+7.27 16/18 34 33813} bxc6 {-2.19 14/26 37 35564}
30. Rg3 {+7.90 15/22 28 37624} Kb7 {-7.07 13/19 40 38243}
31. h4 {+8.10 15/17 32 33634} g4 {-7.38 14/15 33 37494}
32. Kh2 {+8.30 13/14 31 37907} a5 {-7.79 13/14 33 39481}
33. Rb3 {+8.78 14/19 29 36379} c5 {-8.02 13/13 32 38286}
34. Rb5 {+9.45 14/19 34 39981} Kc6 {-8.91 13/18 35 35241}
35. Rxa5 {+9.40 14/20 36 35616} Kxb6 {-9.75 12/15 33 36732}
36. Ra1 {+9.80 15/18 33 39301} g3+ {-10.15 12/18 34 30587}
37. Kh3 {+10.06 15/17 34 33600} Kc6 {-10.70 13/12 36 35603}
38. Rc1 {+10.72 15/16 32 36443} Kd5 {-10.75 13/19 34 39733}
39. b3 {+11.46 14/13 30 33084} Re8 {-11.41 12/15 32 35252}
40. Rd1+ {+12.66 15/13 32 35486} Kc6 {-12.90 13/16 35 36189}
41. Rd3 {+13.29 13/14 31 36697} Re2 {-12.98 11/15 33 33955}
42. Kxg3 {+13.85 14/15 32 32920} Kb5 {-13.77 13/16 36 36298}
43. h5 {+14.23 14/13 32 34421} Kb4 {-14.47 11/17 32 36183}
44. Kh2 {+15.02 13/18 28 32803} Re8 {-14.66 11/12 32 37600}
45. Rh3 {+15.39 15/19 29 35793} c4 {-15.29 12/19 35 36843}
46. bxc4 {+15.80 14/19 29 37057} Kxc4 {-15.35 15/20 31 36507}
47. g4 {+16.17 13/13 26 35516} Kd5 {-15.76 11/12 32 33904}
48. Rf3 {+16.60 13/14 29 32251} Ke4 {-16.43 12/14 31 35541}
49. Kg3 {+16.95 12/15 27 28203} Re6 {-16.96 12/14 31 35219}
50. Rf5 {+18.09 14/15 22 32595} Rh6 {-18.03 11/14 30 20617}
51. Kh4 {+18.99 13/14 20 39864} Rh7 {-18.99 11/17 25 29705}
52. Rf1 {+19.70 12/15 21 39612} Rh8 {-19.75 10/12 24 37115}
53. g5 {+21.12 12/15 31 25717} Kd3 {-21.16 12/15 31 31037}
54. h6 {+21.65 13/17 30 39824} Re8 {-21.76 11/14 30 25120}
55. h7 {+22.79 12/14 30 35242} Ke2 {-22.47 12/14 31 18311}
56. Rf6 {+23.44 12/13 28 24765} Rb8 {-23.70 13/18 33 37862}
57. g6 {+25.41 13/18 34 23189} Rb1 {-24.83 13/16 33 38798}
58. Kg5 {+26.55 12/17 31 29398} Rh1 {-25.17 11/13 36 5389}
59. Rf4 {+27.35 13/21 33 30750} Rg1+ {-27.39 14/17 36 33050}
60. Rg4 {+28.67 13/19 34 32747} Rh1 {-29.83 14/14 35 34343}
61. Rh4 {+28.79 13/17 29 33671} Rg1+ {-29.30 13/17 30 33229}
62. Kh6 {+32.81 12/16 31 26657} Kf2 {-29.33 13/8 34 35604}
63. Rh5 {+36.19 12/14 33 28525} Ke2 {-36.84 12/16 33 27852}
64. g7 {+38.23 12/18 33 39867} Rxg7 {-37.88 12/12 34 24254}
65. Kxg7 {+40.41 14/19 26 29276} Kd3 {-44.12 14/17 27 31734}
66. Kg8 {+49.68 14/22 21 35430} Kc2 {-52.79 15/21 17 27360}
67. Rh3 {+M9 22/10 20 38234} Kc1 {-M50 12/17 6 39103} 68. Rh2 {+M5 30/6 6 37836}
Kb1 {-M4 96/5 9 36036} 69. h8=Q {+M3 96/4 10 35602} Kc1
70. Qa1# {+M1 96/2 3 2516, White mates} 1-0
Loading

0 comments on commit 9f3fc11

Please sign in to comment.