-
Notifications
You must be signed in to change notification settings - Fork 96
Add support for excluding domains and locations in metalink parsing #357
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -440,6 +440,12 @@ typedef enum { | |
| LRO_PASSWORD, /*!< (char *) | ||
| Password for HTTP authentication */ | ||
|
|
||
| LRO_METALINK_EXCLUDE_DOMAIN, /*!< (char ** NULL-terminated) | ||
| List of domains to exclude from metalink */ | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Also, please state in the documentation the version since which this option is available. |
||
|
|
||
| LRO_METALINK_EXCLUDE_LOCATION, /*!< (char ** NULL-terminated) | ||
| List of locations to exclude from metalink */ | ||
|
|
||
| LRO_SENTINEL, /*!< Sentinel */ | ||
|
|
||
| } LrHandleOption; /*!< Handle config options */ | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -30,6 +30,7 @@ | |
| #include "util.h" | ||
| #include "metalink.h" | ||
| #include "xmlparser_internal.h" | ||
| #include "handle_internal.h" | ||
|
|
||
| /** TODO: | ||
| * - (?) Use GStringChunk | ||
|
|
@@ -311,8 +312,44 @@ lr_metalink_start_handler(void *pdata, const xmlChar *xmlElement, const xmlChar | |
| assert(!pd->metalinkurl); | ||
| assert(!pd->metalinkhash); | ||
|
|
||
| pd->skip_url = 0; | ||
|
|
||
| const char *val; | ||
| assert(!pd->metalinkurl); | ||
|
|
||
| if ((val = lr_find_attr("location", attr))) { | ||
| if (pd->handle && pd->handle->metalink_exclude_location) { | ||
| for (int i = 0; pd->handle->metalink_exclude_location[i]; i++) { | ||
| const char *pattern = pd->handle->metalink_exclude_location[i]; | ||
| GError *regex_err = NULL; | ||
| GRegex *regex = g_regex_new(pattern, | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why do you compile and validate the expression here, deep in the XML parser? The best place is in lr_handle_setopt(). |
||
| G_REGEX_OPTIMIZE, | ||
| 0, | ||
| &regex_err); | ||
| if (regex) { | ||
| gboolean matches = g_regex_match(regex, val, 0, NULL); | ||
| g_regex_unref(regex); | ||
| if (matches) { | ||
| pd->skip_url = 1; | ||
| return; | ||
| } | ||
| } else { | ||
| // Invalid regex, treat as literal string | ||
| g_warning("%s: Invalid regex for metalink location exclusion \"%s\": %s", | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Please spell "regex" in full as "regular expression". This is not Perl. |
||
| __func__, pattern, | ||
| regex_err ? regex_err->message : "unknown error"); | ||
| if (regex_err) | ||
| g_error_free(regex_err); | ||
|
|
||
| if (g_strcmp0(val, pattern) == 0) { | ||
| pd->skip_url = 1; | ||
| return; | ||
| } | ||
| } | ||
| } | ||
| } | ||
| } | ||
|
|
||
| LrMetalinkUrl *url = lr_new_metalinkurl(pd->metalink); | ||
| if ((val = lr_find_attr("protocol", attr))) | ||
| url->protocol = g_strdup(val); | ||
|
|
@@ -447,11 +484,75 @@ lr_metalink_end_handler(void *pdata, G_GNUC_UNUSED const xmlChar *element) | |
| break; | ||
|
|
||
| case STATE_URL: | ||
| if (pd->skip_url) { | ||
| pd->skip_url = 0; | ||
| break; | ||
| } | ||
|
|
||
| assert(pd->metalink); | ||
| assert(pd->metalinkurl); | ||
| assert(!pd->metalinkhash); | ||
|
|
||
| pd->metalinkurl->url = g_strdup(pd->content); | ||
| // Check domain filtering before setting URL | ||
| if (pd->handle && pd->handle->metalink_exclude_domain && pd->content) { | ||
| GError *uri_err = NULL; | ||
| GUri *uri = g_uri_parse(pd->content, G_URI_FLAGS_NONE, &uri_err); | ||
| if (uri) { | ||
| const char *host = g_uri_get_host(uri); | ||
| gboolean excluded = FALSE; | ||
| if (host) { | ||
| for (int i = 0; pd->handle->metalink_exclude_domain[i]; i++) { | ||
| const char *pattern = pd->handle->metalink_exclude_domain[i]; | ||
| GError *regex_err = NULL; | ||
| GRegex *regex = g_regex_new(pattern, | ||
| G_REGEX_OPTIMIZE, | ||
| 0, | ||
| &regex_err); | ||
| if (regex) { | ||
| gboolean matches = g_regex_match(regex, host, 0, NULL); | ||
| g_regex_unref(regex); | ||
| if (matches) { | ||
| excluded = TRUE; | ||
| break; | ||
| } | ||
| } else { | ||
| // Invalid regex, treat as literal string | ||
| g_warning("%s: Invalid regex for metalink domain exclusion \"%s\": %s", | ||
| __func__, pattern, | ||
| regex_err ? regex_err->message : "unknown error"); | ||
| if (regex_err) | ||
| g_error_free(regex_err); | ||
|
|
||
| if (g_strcmp0(host, pattern) == 0) { | ||
| excluded = TRUE; | ||
| break; | ||
| } | ||
| } | ||
| } | ||
| } | ||
| g_uri_unref(uri); | ||
|
|
||
| if (excluded) { | ||
| // remove last url | ||
| GSList *last = g_slist_last(pd->metalink->urls); | ||
| lr_free_metalinkurl(last->data); | ||
| pd->metalink->urls = g_slist_delete_link(pd->metalink->urls, last); | ||
| pd->metalinkurl = NULL; | ||
| } | ||
| } else { | ||
| g_debug("%s: Unable to parse URL \"%s\" for domain exclusion: %s", | ||
| __func__, pd->content, | ||
| uri_err ? uri_err->message : "unknown error"); | ||
| if (uri_err) | ||
| g_error_free(uri_err); | ||
| } | ||
| } | ||
|
|
||
| // If URL was excluded above, metalinkurl will be NULL | ||
| if (pd->metalinkurl) { | ||
| pd->metalinkurl->url = g_strdup(pd->content); | ||
| } | ||
|
|
||
| pd->metalinkurl = NULL; | ||
| break; | ||
|
|
||
|
|
@@ -464,6 +565,7 @@ lr_metalink_end_handler(void *pdata, G_GNUC_UNUSED const xmlChar *element) | |
|
|
||
| gboolean | ||
| lr_metalink_parse_file(LrMetalink *metalink, | ||
| LrHandle *handle, | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. lr_metalink_parse_file() is a public function. This breaks API and ABI. I think we need to find a different way of passing the exclusion options. If there is no context or object to store the options in, I recommend adding a new function and making the old one a simple wrapper around it. |
||
| int fd, | ||
| const char *filename, | ||
| LrXmlParserWarningCb warningcb, | ||
|
|
@@ -491,6 +593,7 @@ lr_metalink_parse_file(LrMetalink *metalink, | |
| pd->parser = &parser; | ||
| pd->state = STATE_START; | ||
| pd->metalink = metalink; | ||
| pd->handle = handle; | ||
| pd->filename = (char *) filename; | ||
| pd->ignore = 1; | ||
| pd->found = 0; | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -102,6 +102,20 @@ | |
|
|
||
| *String or None*. Set password for HTTP authentication. | ||
|
|
||
| .. data:: LRO_METALINK_EXCLUDE_DOMAIN | ||
|
|
||
| *List of strings*. List of regex patterns to exclude domains from metalink. | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why is it a list? If it is a regular expression, then a string is enough. |
||
| Each pattern is matched against the domain name in metalink URLs. If the pattern | ||
| is not a valid regex, it will be treated as a literal string match. | ||
| Example: ``["^mirror\\.example\\.com$"]`` | ||
|
|
||
| .. data:: LRO_METALINK_EXCLUDE_LOCATION | ||
|
|
||
| *List of strings*. List of regex patterns to exclude locations from metalink. | ||
| Each pattern is matched against the location attribute in metalink URLs. If the | ||
| pattern is not a valid regex, it will be treated as a literal string match. | ||
| Example: ``["^US$", "^(CA|MX)$"]`` | ||
|
|
||
| .. data:: LRO_PROXY | ||
|
|
||
| *String or None*. Set proxy server address. Port could be | ||
|
|
@@ -1280,6 +1294,14 @@ class Handle(_librepo.Handle): | |
|
|
||
| See: :data:`.LRO_PASSWORD` | ||
|
|
||
| .. attribute:: metalink_exclude_domain: | ||
|
|
||
| See: :data:`.LRO_METALINK_EXCLUDE_DOMAIN` | ||
|
|
||
| .. attribute:: metalink_exclude_location: | ||
|
|
||
| See: :data:`.LRO_METALINK_EXCLUDE_LOCATION` | ||
|
|
||
| .. attribute:: proxy: | ||
|
|
||
| See: :data:`.LRO_PROXY` | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This enhances the API. You need to increase the minor version of librepo in VERSION.cmake.