From 92b0701fb53660aa48c8a8ba83e9b14fac69dbed Mon Sep 17 00:00:00 2001 From: Byoungchan Lee Date: Mon, 20 Oct 2025 07:18:02 +0900 Subject: [PATCH] Add support for excluding domains and locations in metalink parsing This commit introduces two new options, LRO_METALINK_EXCLUDE_DOMAIN and LRO_METALINK_EXCLUDE_LOCATION, to the LrHandle structure. These options allow users to specify lists of domains and locations to exclude during metalink processing. --- librepo/handle.c | 9 +++ librepo/handle.h.in | 6 ++ librepo/handle_internal.h | 6 ++ librepo/metalink.c | 105 +++++++++++++++++++++++++++++- librepo/metalink.h | 4 ++ librepo/python/__init__.py | 22 +++++++ librepo/python/handle-py.c | 2 + librepo/python/librepomodule.c | 2 + librepo/xmlparser_internal.h | 5 ++ tests/test_metalink.c | 114 ++++++++++++++++++++++++++++++--- 10 files changed, 265 insertions(+), 10 deletions(-) diff --git a/librepo/handle.c b/librepo/handle.c index 8a48ccf4a..69b933a7f 100644 --- a/librepo/handle.c +++ b/librepo/handle.c @@ -173,6 +173,8 @@ lr_handle_free(LrHandle *handle) lr_lrmirrorlist_free(handle->metalink_mirrors); lr_lrmirrorlist_free(handle->mirrors); lr_metalink_free(handle->metalink); + lr_handle_free_list(&handle->metalink_exclude_domain); + lr_handle_free_list(&handle->metalink_exclude_location); lr_handle_free_list(&handle->yumdlist); lr_urlvars_free(handle->yumslist); lr_handle_free_list(&handle->yumblist); @@ -536,6 +538,8 @@ lr_handle_setopt(LrHandle *handle, case LRO_URLS: case LRO_YUMDLIST: case LRO_YUMBLIST: + case LRO_METALINK_EXCLUDE_DOMAIN: + case LRO_METALINK_EXCLUDE_LOCATION: { int size = 0; char **list = va_arg(arg, char **); @@ -548,6 +552,10 @@ lr_handle_setopt(LrHandle *handle, handle_list = &handle->yumdlist; } else if (option == LRO_YUMBLIST) { handle_list = &handle->yumblist; + } else if (option == LRO_METALINK_EXCLUDE_DOMAIN) { + handle_list = &handle->metalink_exclude_domain; + } else if (option == LRO_METALINK_EXCLUDE_LOCATION) { + handle_list = 
&handle->metalink_exclude_location; } lr_handle_free_list(handle_list); @@ -1212,6 +1220,7 @@ lr_handle_prepare_metalink(LrHandle *handle, gchar *localpath, GError **err) LrMetalink *ml = lr_metalink_init(); gboolean ret = lr_metalink_parse_file(ml, + handle, fd, metalink_file, lr_xml_parser_warning_logger, diff --git a/librepo/handle.h.in b/librepo/handle.h.in index eb242dded..114211074 100644 --- a/librepo/handle.h.in +++ b/librepo/handle.h.in @@ -440,6 +440,12 @@ typedef enum { LRO_PASSWORD, /*!< (char *) Password for HTTP authentication */ + LRO_METALINK_EXCLUDE_DOMAIN, /*!< (char ** NULL-terminated) + List of domains to exclude from metalink */ + + LRO_METALINK_EXCLUDE_LOCATION, /*!< (char ** NULL-terminated) + List of locations to exclude from metalink */ + LRO_SENTINEL, /*!< Sentinel */ } LrHandleOption; /*!< Handle config options */ diff --git a/librepo/handle_internal.h b/librepo/handle_internal.h index 11c87594a..e37cf1394 100644 --- a/librepo/handle_internal.h +++ b/librepo/handle_internal.h @@ -94,6 +94,12 @@ struct _LrHandle { LrMetalink *metalink; /*!< Parsed metalink for repomd.xml */ + char **metalink_exclude_domain; /*!< + List of domains to exclude from metalink */ + + char **metalink_exclude_location; /*!< + List of locations to exclude from metalink */ + LrInternalMirrorlist *mirrors; /*!< Mirrors from metalink or mirrorlist */ diff --git a/librepo/metalink.c b/librepo/metalink.c index 1f839a9a1..a2ed10f8a 100644 --- a/librepo/metalink.c +++ b/librepo/metalink.c @@ -30,6 +30,7 @@ #include "util.h" #include "metalink.h" #include "xmlparser_internal.h" +#include "handle_internal.h" /** TODO: * - (?) 
Use GStringChunk @@ -311,8 +312,44 @@ lr_metalink_start_handler(void *pdata, const xmlChar *xmlElement, const xmlChar assert(!pd->metalinkurl); assert(!pd->metalinkhash); + pd->skip_url = 0; + const char *val; assert(!pd->metalinkurl); + + if ((val = lr_find_attr("location", attr))) { + if (pd->handle && pd->handle->metalink_exclude_location) { + for (int i = 0; pd->handle->metalink_exclude_location[i]; i++) { + const char *pattern = pd->handle->metalink_exclude_location[i]; + GError *regex_err = NULL; + GRegex *regex = g_regex_new(pattern, + G_REGEX_OPTIMIZE, + 0, + &regex_err); + if (regex) { + gboolean matches = g_regex_match(regex, val, 0, NULL); + g_regex_unref(regex); + if (matches) { + pd->skip_url = 1; + return; + } + } else { + // Invalid regex, treat as literal string + g_warning("%s: Invalid regex for metalink location exclusion \"%s\": %s", + __func__, pattern, + regex_err ? regex_err->message : "unknown error"); + if (regex_err) + g_error_free(regex_err); + + if (g_strcmp0(val, pattern) == 0) { + pd->skip_url = 1; + return; + } + } + } + } + } + LrMetalinkUrl *url = lr_new_metalinkurl(pd->metalink); if ((val = lr_find_attr("protocol", attr))) url->protocol = g_strdup(val); @@ -447,11 +484,75 @@ lr_metalink_end_handler(void *pdata, G_GNUC_UNUSED const xmlChar *element) break; case STATE_URL: + if (pd->skip_url) { + pd->skip_url = 0; + break; + } + assert(pd->metalink); assert(pd->metalinkurl); assert(!pd->metalinkhash); - pd->metalinkurl->url = g_strdup(pd->content); + // Check domain filtering before setting URL + if (pd->handle && pd->handle->metalink_exclude_domain && pd->content) { + GError *uri_err = NULL; + GUri *uri = g_uri_parse(pd->content, G_URI_FLAGS_NONE, &uri_err); + if (uri) { + const char *host = g_uri_get_host(uri); + gboolean excluded = FALSE; + if (host) { + for (int i = 0; pd->handle->metalink_exclude_domain[i]; i++) { + const char *pattern = pd->handle->metalink_exclude_domain[i]; + GError *regex_err = NULL; + GRegex *regex =
g_regex_new(pattern, + G_REGEX_OPTIMIZE, + 0, + &regex_err); + if (regex) { + gboolean matches = g_regex_match(regex, host, 0, NULL); + g_regex_unref(regex); + if (matches) { + excluded = TRUE; + break; + } + } else { + // Invalid regex, treat as literal string + g_warning("%s: Invalid regex for metalink domain exclusion \"%s\": %s", + __func__, pattern, + regex_err ? regex_err->message : "unknown error"); + if (regex_err) + g_error_free(regex_err); + + if (g_strcmp0(host, pattern) == 0) { + excluded = TRUE; + break; + } + } + } + } + g_uri_unref(uri); + + if (excluded) { + // remove last url + GSList *last = g_slist_last(pd->metalink->urls); + lr_free_metalinkurl(last->data); + pd->metalink->urls = g_slist_delete_link(pd->metalink->urls, last); + pd->metalinkurl = NULL; + } + } else { + g_debug("%s: Unable to parse URL \"%s\" for domain exclusion: %s", + __func__, pd->content, + uri_err ? uri_err->message : "unknown error"); + if (uri_err) + g_error_free(uri_err); + } + } + + // If URL was excluded above, metalinkurl will be NULL + if (pd->metalinkurl) { + pd->metalinkurl->url = g_strdup(pd->content); + } + pd->metalinkurl = NULL; break; @@ -464,6 +565,7 @@ lr_metalink_end_handler(void *pdata, G_GNUC_UNUSED const xmlChar *element) gboolean lr_metalink_parse_file(LrMetalink *metalink, + LrHandle *handle, int fd, const char *filename, LrXmlParserWarningCb warningcb, @@ -491,6 +593,7 @@ lr_metalink_parse_file(LrMetalink *metalink, pd->parser = &parser; pd->state = STATE_START; pd->metalink = metalink; + pd->handle = handle; pd->filename = (char *) filename; pd->ignore = 1; pd->found = 0; diff --git a/librepo/metalink.h b/librepo/metalink.h index c959096ae..29f9f9093 100644 --- a/librepo/metalink.h +++ b/librepo/metalink.h @@ -23,6 +23,7 @@ #include #include +#include G_BEGIN_DECLS @@ -71,6 +72,8 @@ lr_metalink_init(void); /** Parse metalink file. * @param metalink Metalink object. + * @param handle LrHandle or NULL.
If LrHandle is provided, it's used + * for filtering metalink data (e.g. by location or domain). * @param fd File descriptor. * @param filename File to look for in metalink file. * @param warningcb ::LrXmlParserWarningCb function or NULL @@ -80,6 +83,7 @@ lr_metalink_init(void); */ gboolean lr_metalink_parse_file(LrMetalink *metalink, + LrHandle *handle, int fd, const char *filename, LrXmlParserWarningCb warningcb, diff --git a/librepo/python/__init__.py b/librepo/python/__init__.py index a005e7034..225c5fca0 100644 --- a/librepo/python/__init__.py +++ b/librepo/python/__init__.py @@ -102,6 +102,20 @@ *String or None*. Set password for HTTP authentication. +.. data:: LRO_METALINK_EXCLUDE_DOMAIN + + *List of strings*. List of regex patterns to exclude domains from metalink. + Each pattern is matched against the domain name in metalink URLs. If the pattern + is not a valid regex, it will be treated as a literal string match. + Example: ``["^mirror\\.example\\.com$"]`` + +.. data:: LRO_METALINK_EXCLUDE_LOCATION + + *List of strings*. List of regex patterns to exclude locations from metalink. + Each pattern is matched against the location attribute in metalink URLs. If the + pattern is not a valid regex, it will be treated as a literal string match. + Example: ``["^US$", "^(CA|MX)$"]`` + .. data:: LRO_PROXY *String or None*. Set proxy server address. Port could be @@ -1280,6 +1294,14 @@ class Handle(_librepo.Handle): See: :data:`.LRO_PASSWORD` + .. attribute:: metalink_exclude_domain: + + See: :data:`.LRO_METALINK_EXCLUDE_DOMAIN` + + .. attribute:: metalink_exclude_location: + + See: :data:`.LRO_METALINK_EXCLUDE_LOCATION` + .. 
attribute:: proxy: See: :data:`.LRO_PROXY` diff --git a/librepo/python/handle-py.c b/librepo/python/handle-py.c index ccc1ef427..e29284d34 100644 --- a/librepo/python/handle-py.c +++ b/librepo/python/handle-py.c @@ -540,6 +540,8 @@ py_setopt(_HandleObject *self, PyObject *args) case LRO_URLS: case LRO_YUMDLIST: case LRO_YUMBLIST: + case LRO_METALINK_EXCLUDE_DOMAIN: + case LRO_METALINK_EXCLUDE_LOCATION: case LRO_HTTPHEADER: { Py_ssize_t len = 0; diff --git a/librepo/python/librepomodule.c b/librepo/python/librepomodule.c index b7b1b5931..4d4800621 100644 --- a/librepo/python/librepomodule.c +++ b/librepo/python/librepomodule.c @@ -240,6 +240,8 @@ PyInit__librepo(void) PYMODULE_ADDINTCONSTANT(LRO_USERPWD); PYMODULE_ADDINTCONSTANT(LRO_USERNAME); PYMODULE_ADDINTCONSTANT(LRO_PASSWORD); + PYMODULE_ADDINTCONSTANT(LRO_METALINK_EXCLUDE_DOMAIN); + PYMODULE_ADDINTCONSTANT(LRO_METALINK_EXCLUDE_LOCATION); PYMODULE_ADDINTCONSTANT(LRO_PROXY); PYMODULE_ADDINTCONSTANT(LRO_PROXYPORT); PYMODULE_ADDINTCONSTANT(LRO_PROXYTYPE); diff --git a/librepo/xmlparser_internal.h b/librepo/xmlparser_internal.h index 25a48a532..9eac6ed66 100644 --- a/librepo/xmlparser_internal.h +++ b/librepo/xmlparser_internal.h @@ -29,6 +29,8 @@ G_BEGIN_DECLS +struct _LrHandle; + /** \defgroup xmlparser_internal Common stuff for XML parsers in Librepo (datatypes, etc.) 
* \addtogroup xmlparser_internal * @{ @@ -91,6 +93,8 @@ typedef struct { ignore all subelements of the current file element */ int found; /*!< wanted file was already parsed */ + int skip_url; /*!< + skip currently parsed url element */ LrMetalink *metalink; /*!< metalink object */ @@ -100,6 +104,7 @@ typedef struct { Hash in progress or NULL */ LrMetalinkAlternate *metalinkalternate; /*!< Alternate in progress or NULL */ + struct _LrHandle *handle; } LrParserData; diff --git a/tests/test_metalink.c b/tests/test_metalink.c index 1440125f6..c758f9b30 100644 --- a/tests/test_metalink.c +++ b/tests/test_metalink.c @@ -10,6 +10,7 @@ #include "librepo/types.h" #include "librepo/metalink.h" #include "librepo/util.h" +#include "librepo/handle.h" #define REPOMD "repomd.xml" #define METALINK_DIR "metalinks" @@ -52,7 +53,7 @@ START_TEST(test_metalink_good_01) ck_assert_int_ge(fd, 0); ml = lr_metalink_init(); ck_assert_ptr_nonnull(ml); - ret = lr_metalink_parse_file(ml, fd, REPOMD, NULL, NULL, &tmp_err); + ret = lr_metalink_parse_file(ml, NULL, fd, REPOMD, NULL, NULL, &tmp_err); ck_assert(ret); ck_assert_ptr_null(tmp_err); close(fd); @@ -164,7 +165,7 @@ START_TEST(test_metalink_good_02) ck_assert_int_ge(fd, 0); ml = lr_metalink_init(); ck_assert_ptr_nonnull(ml); - ret = lr_metalink_parse_file(ml, fd, REPOMD, NULL, NULL, &tmp_err); + ret = lr_metalink_parse_file(ml, NULL, fd, REPOMD, NULL, NULL, &tmp_err); ck_assert(ret); ck_assert_ptr_null(tmp_err); close(fd); @@ -210,7 +211,7 @@ START_TEST(test_metalink_good_03) ck_assert_int_ge(fd, 0); ml = lr_metalink_init(); ck_assert_ptr_nonnull(ml); - ret = lr_metalink_parse_file(ml, fd, REPOMD, NULL, NULL, &tmp_err); + ret = lr_metalink_parse_file(ml, NULL, fd, REPOMD, NULL, NULL, &tmp_err); ck_assert(ret); ck_assert_ptr_null(tmp_err); close(fd); @@ -255,7 +256,7 @@ START_TEST(test_metalink_bad_01) ml = lr_metalink_init(); ck_assert_ptr_nonnull(ml); int call_counter = 0; - ret = lr_metalink_parse_file(ml, fd, REPOMD, warning_cb, 
&call_counter, &tmp_err); + ret = lr_metalink_parse_file(ml, NULL, fd, REPOMD, warning_cb, &call_counter, &tmp_err); ck_assert(ret); ck_assert_ptr_null(tmp_err); ck_assert_int_gt(call_counter, 0); @@ -375,7 +376,7 @@ START_TEST(test_metalink_bad_02) ck_assert_int_ge(fd, 0); ml = lr_metalink_init(); ck_assert_ptr_nonnull(ml); - ret = lr_metalink_parse_file(ml, fd, REPOMD, NULL, NULL, &tmp_err); + ret = lr_metalink_parse_file(ml, NULL, fd, REPOMD, NULL, NULL, &tmp_err); ck_assert(ret); ck_assert_ptr_null(tmp_err); close(fd); @@ -399,7 +400,7 @@ START_TEST(test_metalink_really_bad_01) ck_assert_int_ge(fd, 0); ml = lr_metalink_init(); ck_assert_ptr_nonnull(ml); - ret = lr_metalink_parse_file(ml, fd, REPOMD, NULL, NULL, &tmp_err); + ret = lr_metalink_parse_file(ml, NULL, fd, REPOMD, NULL, NULL, &tmp_err); ck_assert(!ret); ck_assert_ptr_nonnull(tmp_err); g_error_free(tmp_err); @@ -423,7 +424,7 @@ START_TEST(test_metalink_really_bad_02) ck_assert_int_ge(fd, 0); ml = lr_metalink_init(); ck_assert_ptr_nonnull(ml); - ret = lr_metalink_parse_file(ml, fd, REPOMD, NULL, NULL, &tmp_err); + ret = lr_metalink_parse_file(ml, NULL, fd, REPOMD, NULL, NULL, &tmp_err); ck_assert(!ret); ck_assert_ptr_nonnull(tmp_err); g_error_free(tmp_err); @@ -447,7 +448,7 @@ START_TEST(test_metalink_really_bad_03) ck_assert_int_ge(fd, 0); ml = lr_metalink_init(); ck_assert_ptr_nonnull(ml); - ret = lr_metalink_parse_file(ml, fd, REPOMD, NULL, NULL, &tmp_err); + ret = lr_metalink_parse_file(ml, NULL, fd, REPOMD, NULL, NULL, &tmp_err); ck_assert(!ret); ck_assert_ptr_nonnull(tmp_err); g_error_free(tmp_err); @@ -474,7 +475,7 @@ START_TEST(test_metalink_with_alternates) ck_assert_int_ge(fd, 0); ml = lr_metalink_init(); ck_assert_ptr_nonnull(ml); - ret = lr_metalink_parse_file(ml, fd, REPOMD, NULL, NULL, &tmp_err); + ret = lr_metalink_parse_file(ml, NULL, fd, REPOMD, NULL, NULL, &tmp_err); ck_assert(ret); ck_assert_ptr_null(tmp_err); close(fd); @@ -511,6 +512,99 @@ START_TEST(test_metalink_with_alternates) } 
END_TEST +START_TEST(test_metalink_exclude_location) +{ + LrHandle *h = lr_handle_init(); + ck_assert_ptr_nonnull(h); + + // Test with regex patterns - exclude US and CA locations + char *exclude_locations[] = {"^(US|CA)$", NULL}; + gboolean ret_setopt = lr_handle_setopt(h, NULL, LRO_METALINK_EXCLUDE_LOCATION, exclude_locations); + ck_assert(ret_setopt); + + int fd; + gboolean ret; + char *path; + LrMetalink *ml = NULL; + GError *tmp_err = NULL; + + path = lr_pathconcat(test_globals.testdata_dir, METALINK_DIR, "metalink_good_01", NULL); + fd = open(path, O_RDONLY); + g_free(path); + ck_assert_int_ge(fd, 0); + + ml = lr_metalink_init(); + ck_assert_ptr_nonnull(ml); + + ret = lr_metalink_parse_file(ml, h, fd, REPOMD, NULL, NULL, &tmp_err); + ck_assert(ret); + ck_assert_ptr_null(tmp_err); + close(fd); + + for (GSList *elem = ml->urls; elem != NULL; elem = elem->next) { + LrMetalinkUrl *mlurl = elem->data; + ck_assert_ptr_nonnull(mlurl); + if (mlurl->location) { + ck_assert_str_ne(mlurl->location, "US"); + ck_assert_str_ne(mlurl->location, "CA"); + } + } + + lr_metalink_free(ml); + lr_handle_free(h); +} +END_TEST + +START_TEST(test_metalink_exclude_domain) +{ + LrHandle *h = lr_handle_init(); + ck_assert_ptr_nonnull(h); + + // Test with regex patterns - exclude specific domains + char *exclude_domains[] = {"^(mirror\\.pnl\\.gov|mirrors\\.syringanetworks\\.net)$", NULL}; + gboolean ret_setopt = lr_handle_setopt(h, NULL, LRO_METALINK_EXCLUDE_DOMAIN, exclude_domains); + ck_assert(ret_setopt); + + int fd; + gboolean ret; + char *path; + LrMetalink *ml = NULL; + GError *tmp_err = NULL; + + path = lr_pathconcat(test_globals.testdata_dir, METALINK_DIR, "metalink_good_01", NULL); + fd = open(path, O_RDONLY); + g_free(path); + ck_assert_int_ge(fd, 0); + + ml = lr_metalink_init(); + ck_assert_ptr_nonnull(ml); + + ret = lr_metalink_parse_file(ml, h, fd, REPOMD, NULL, NULL, &tmp_err); + ck_assert(ret); + ck_assert_ptr_null(tmp_err); + close(fd); + + for (GSList *elem = ml->urls; 
elem != NULL; elem = elem->next) { + LrMetalinkUrl *mlurl = elem->data; + ck_assert_ptr_nonnull(mlurl); + if (mlurl->url) { + GUri *uri = g_uri_parse(mlurl->url, G_URI_FLAGS_NONE, NULL); + if (uri) { + const char *host = g_uri_get_host(uri); + if (host) { + ck_assert_str_ne(host, "mirror.pnl.gov"); + ck_assert_str_ne(host, "mirrors.syringanetworks.net"); + } + g_uri_unref(uri); + } + } + } + + lr_metalink_free(ml); + lr_handle_free(h); +} +END_TEST + Suite * metalink_suite(void) { @@ -526,6 +620,8 @@ metalink_suite(void) tcase_add_test(tc, test_metalink_really_bad_02); tcase_add_test(tc, test_metalink_really_bad_03); tcase_add_test(tc, test_metalink_with_alternates); + tcase_add_test(tc, test_metalink_exclude_location); + tcase_add_test(tc, test_metalink_exclude_domain); suite_add_tcase(s, tc); return s; }