From f89b47f6dbeda2be8dd32f84c6e1a6f34fbd4390 Mon Sep 17 00:00:00 2001 From: Kenji Nagahashi Date: Fri, 28 Mar 2014 13:52:34 -0700 Subject: [PATCH] FIX: directive with space before ":" gets ignored (ref. WWM-98) --- .../accesscontrol/robotstxt/RobotRules.java | 22 ++++++++++++------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotRules.java b/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotRules.java index 5a2327a5ab..3b7653b4bb 100644 --- a/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotRules.java +++ b/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotRules.java @@ -30,6 +30,7 @@ import java.util.List; import java.util.logging.Level; import java.util.logging.Logger; +import java.util.regex.Matcher; import java.util.regex.Pattern; import org.archive.wayback.util.ByteOp; @@ -52,9 +53,9 @@ public class RobotRules { */ public static final String GLOBAL_USER_AGENT = "*"; - protected static final Pattern USER_AGENT_PATTERN = Pattern.compile("(?i)^User-agent:.*"); - protected static final Pattern DISALLOW_PATTERN = Pattern.compile("(?i)Disallow:.*"); - protected static final Pattern ALLOW_PATTERN = Pattern.compile("(?i)Allow:.*"); + protected static final Pattern USER_AGENT_PATTERN = Pattern.compile("(?i)^User-agent\\s*:(.*)"); + protected static final Pattern DISALLOW_PATTERN = Pattern.compile("(?i)Disallow\\s*:(.*)"); + protected static final Pattern ALLOW_PATTERN = Pattern.compile("(?i)Allow\\s*:(.*)"); private boolean bSyntaxErrors = false; private HashMap<String, ArrayList<String>> rules = @@ -112,8 +113,9 @@ public void parse(InputStream is) throws IOException { read = read.substring(0, commentIndex); } read = read.trim(); - if (USER_AGENT_PATTERN.matcher(read).matches()) { - String ua = read.substring(11).trim().toLowerCase(); + Matcher uaMatcher = USER_AGENT_PATTERN.matcher(read); + if (uaMatcher.matches()) { + 
String ua = uaMatcher.group(1).trim().toLowerCase(); if (current == null || current.size() != 0 || allowRuleFound || !lastLineUA) { // only create new rules-list if necessary // otherwise share with previous user-agent @@ -124,16 +126,20 @@ public void parse(InputStream is) throws IOException { currLineUA = true; LOGGER.fine("Found User-agent(" + ua + ") rules..."); continue; - } else if (DISALLOW_PATTERN.matcher(read).matches()) { + } + Matcher disallowMatcher = DISALLOW_PATTERN.matcher(read); + if (disallowMatcher.matches()) { if (current == null) { // buggy robots.txt bSyntaxErrors = true; continue; } - String path = read.substring(9).trim(); + String path = disallowMatcher.group(1).trim(); current.add(path); continue; - } else if (ALLOW_PATTERN.matcher(read).matches()) { + } + Matcher allowMatcher = ALLOW_PATTERN.matcher(read); + if (allowMatcher.matches()) { // Mark that there was an allow rule to clear the current list for next user-agent allowRuleFound = true; }