diff --git a/src/main/java/com/soulgalore/crawler/core/CrawlerResult.java b/src/main/java/com/soulgalore/crawler/core/CrawlerResult.java index c36e409..56a2349 100644 --- a/src/main/java/com/soulgalore/crawler/core/CrawlerResult.java +++ b/src/main/java/com/soulgalore/crawler/core/CrawlerResult.java @@ -41,6 +41,7 @@ public class CrawlerResult { * * @param theStartPoint where the crawl was started * @param theUrls the urls that was fetched + * @param theVerifiedResponses the verified responses * @param theNonWorkingResponses the non working urls */ public CrawlerResult(String theStartPoint, Set theUrls, diff --git a/src/main/java/com/soulgalore/crawler/core/HTMLPageResponse.java b/src/main/java/com/soulgalore/crawler/core/HTMLPageResponse.java index 35e8c46..38e48cd 100644 --- a/src/main/java/com/soulgalore/crawler/core/HTMLPageResponse.java +++ b/src/main/java/com/soulgalore/crawler/core/HTMLPageResponse.java @@ -51,6 +51,7 @@ public class HTMLPageResponse { * @param theBody the body * @param theEncoding the encoding * @param theSize the size + * @param theResponseType the response mime type * @param theFetchTime the time it took to fetch the response */ public HTMLPageResponse(CrawlerURL pageUrl, int theResponseCode, Map theHeaders, diff --git a/src/main/java/com/soulgalore/crawler/core/HTMLPageResponseCallable.java b/src/main/java/com/soulgalore/crawler/core/HTMLPageResponseCallable.java index 24761fb..6386389 100644 --- a/src/main/java/com/soulgalore/crawler/core/HTMLPageResponseCallable.java +++ b/src/main/java/com/soulgalore/crawler/core/HTMLPageResponseCallable.java @@ -45,6 +45,8 @@ public class HTMLPageResponseCallable implements Callable { * @param theUrl the url to call. * @param theFetcher the fetcher to use * @param fetchTheBody if true, the response body is fetched, else not. + * @param theRequestHeaders request headers to add + * @param followRedirectsToNewDomain if true, follow redirects that lead to a different domain. */ public HTMLPageResponseCallable(CrawlerURL theUrl, HTMLPageResponseFetcher theFetcher, boolean fetchTheBody, Map theRequestHeaders, boolean followRedirectsToNewDomain) { diff --git a/src/main/java/com/soulgalore/crawler/core/HTMLPageResponseFetcher.java b/src/main/java/com/soulgalore/crawler/core/HTMLPageResponseFetcher.java index 2822db5..f0f3804 100644 --- a/src/main/java/com/soulgalore/crawler/core/HTMLPageResponseFetcher.java +++ b/src/main/java/com/soulgalore/crawler/core/HTMLPageResponseFetcher.java @@ -34,9 +34,13 @@ public interface HTMLPageResponseFetcher { * * @param url the url to fetch * @param fetchBody fetch the body or not + * @param requestHeaders request headers to add + * @param followRedirectsToNewDomain if true, follow redirects that lead to a different domain. * @return the response */ - HTMLPageResponse get(CrawlerURL url, boolean fetchBody, Map requestHeaders, boolean followRedirectsToNewDomain); + HTMLPageResponse get(CrawlerURL url, boolean fetchBody, + Map requestHeaders, + boolean followRedirectsToNewDomain); /** diff --git a/src/main/java/com/soulgalore/crawler/core/assets/AssetsVerifier.java b/src/main/java/com/soulgalore/crawler/core/assets/AssetsVerifier.java index 79f2bf8..2e1b04f 100644 --- a/src/main/java/com/soulgalore/crawler/core/assets/AssetsVerifier.java +++ b/src/main/java/com/soulgalore/crawler/core/assets/AssetsVerifier.java @@ -31,8 +31,9 @@ public interface AssetsVerifier { /** * Verify that all the assets work (=return 200) for the working urls in the result. * - * @param responses - * @return + * @param responses responses to verify + * @param configuration configuration to verify against + * @return result of the verification */ AssetsVerificationResult verify(Set responses, CrawlerConfiguration configuration); diff --git a/src/main/java/com/soulgalore/crawler/core/impl/DefaultCrawler.java b/src/main/java/com/soulgalore/crawler/core/impl/DefaultCrawler.java index d123d90..7a9b55b 100644 --- a/src/main/java/com/soulgalore/crawler/core/impl/DefaultCrawler.java +++ b/src/main/java/com/soulgalore/crawler/core/impl/DefaultCrawler.java @@ -161,6 +161,7 @@ public CrawlerResult getUrls(CrawlerConfiguration configuration) { * @param responses holding bodys where we should fetch the links. * @param allUrls every url we have fetched so far * @param nonWorkingUrls the urls that didn't work to fetch + * @param verifiedUrls responses that are already verified * @param host the host we are working on * @param onlyOnPath only fetch files that match the following path. If empty, all will match. * @param notOnPath don't collect/follow urls that contains this text in the url diff --git a/src/main/java/com/soulgalore/crawler/core/impl/HTTPClientResponseFetcher.java b/src/main/java/com/soulgalore/crawler/core/impl/HTTPClientResponseFetcher.java index 6d7179a..85cdda9 100644 --- a/src/main/java/com/soulgalore/crawler/core/impl/HTTPClientResponseFetcher.java +++ b/src/main/java/com/soulgalore/crawler/core/impl/HTTPClientResponseFetcher.java @@ -68,20 +68,10 @@ public HTTPClientResponseFetcher(HttpClient client) { httpClient = client; } - /** - * Shutdown the client. - */ public void shutdown() { httpClient.getConnectionManager().shutdown(); } - /** - * Get a response. - * - * @param url the url - * @param getPage the body of the page or not - * @return the response - */ public HTMLPageResponse get(CrawlerURL url, boolean getPage, Map requestHeaders, boolean followRedirectsToNewDomain) { if (url.isWrongSyntax()) { diff --git a/src/main/java/com/soulgalore/crawler/util/HTTPSFaker.java b/src/main/java/com/soulgalore/crawler/util/HTTPSFaker.java index 9178ed8..820e316 100644 --- a/src/main/java/com/soulgalore/crawler/util/HTTPSFaker.java +++ b/src/main/java/com/soulgalore/crawler/util/HTTPSFaker.java @@ -54,7 +54,8 @@ private HTTPSFaker() {} /** * Get a HttpClient that accept any HTTP certificate. - * + * + * @param cm the connection manager to use when creating the new HttpClient * @return a httpClient that accept any HTTP certificate */ @SuppressWarnings("deprecation")