Skip to content

Commit

Permalink
FIX: directive with space before ":" gets ignored (ref. WWM-98)
Browse files: browse the repository at this point in the history
  • Loading branch information
kngenie committed Mar 28, 2014
1 parent 8e70e37 commit f89b47f
Showing 1 changed file with 14 additions and 8 deletions.
Original file line number | Diff line number | Diff line change
Expand Up @@ -30,6 +30,7 @@
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.archive.wayback.util.ByteOp;
Expand All @@ -52,9 +53,9 @@ public class RobotRules {
*/
public static final String GLOBAL_USER_AGENT = "*";

protected static final Pattern USER_AGENT_PATTERN = Pattern.compile("(?i)^User-agent:.*");
protected static final Pattern DISALLOW_PATTERN = Pattern.compile("(?i)Disallow:.*");
protected static final Pattern ALLOW_PATTERN = Pattern.compile("(?i)Allow:.*");
protected static final Pattern USER_AGENT_PATTERN = Pattern.compile("(?i)^User-agent\\s*:(.*)");
protected static final Pattern DISALLOW_PATTERN = Pattern.compile("(?i)Disallow\\s*:(.*)");
protected static final Pattern ALLOW_PATTERN = Pattern.compile("(?i)Allow\\s*:(.*)");

private boolean bSyntaxErrors = false;
private HashMap<String, ArrayList<String>> rules =
Expand Down Expand Up @@ -112,8 +113,9 @@ public void parse(InputStream is) throws IOException {
read = read.substring(0, commentIndex);
}
read = read.trim();
if (USER_AGENT_PATTERN.matcher(read).matches()) {
String ua = read.substring(11).trim().toLowerCase();
Matcher uaMatcher = USER_AGENT_PATTERN.matcher(read);
if (uaMatcher.matches()) {
String ua = uaMatcher.group(1).trim().toLowerCase();
if (current == null || current.size() != 0 || allowRuleFound || !lastLineUA) {
// only create new rules-list if necessary
// otherwise share with previous user-agent
Expand All @@ -124,16 +126,20 @@ public void parse(InputStream is) throws IOException {
currLineUA = true;
LOGGER.fine("Found User-agent(" + ua + ") rules...");
continue;
} else if (DISALLOW_PATTERN.matcher(read).matches()) {
}
Matcher disallowMatcher = DISALLOW_PATTERN.matcher(read);
if (disallowMatcher.matches()) {
if (current == null) {
// buggy robots.txt
bSyntaxErrors = true;
continue;
}
String path = read.substring(9).trim();
String path = disallowMatcher.group(1).trim();
current.add(path);
continue;
} else if (ALLOW_PATTERN.matcher(read).matches()) {
}
Matcher allowMatcher = ALLOW_PATTERN.matcher(read);
if (allowMatcher.matches()) {
// Mark that there was an allow rule to clear the current list for next user-agent
allowRuleFound = true;
}
Expand Down

0 comments on commit f89b47f

Please sign in to comment.