diff --git a/.clang-format b/.clang-format new file mode 100644 index 0000000..3d6016a --- /dev/null +++ b/.clang-format @@ -0,0 +1,12 @@ +# Defines the Chromium style for automatic reformatting. +# http://clang.llvm.org/docs/ClangFormatStyleOptions.html +BasedOnStyle: Chromium +# This defaults to 'Auto'. Explicitly set it for a while, so that +# 'vector >' in existing files gets formatted to +# 'vector>'. ('Auto' means that clang-format will only use +# 'int>>' if the file already contains at least one such instance.) +Standard: c++17 +SortIncludes: CaseSensitive +--- +Language: ObjC +ColumnLimit: 100 \ No newline at end of file diff --git a/.clang-tidy b/.clang-tidy new file mode 100644 index 0000000..fd8c681 --- /dev/null +++ b/.clang-tidy @@ -0,0 +1,147 @@ +# Generated from CLion Inspection settings +--- +Checks: '-*, +bugprone-argument-comment, +bugprone-assert-side-effect, +bugprone-bad-signal-to-kill-thread, +bugprone-branch-clone, +bugprone-copy-constructor-init, +bugprone-dangling-handle, +bugprone-dynamic-static-initializers, +bugprone-fold-init-type, +bugprone-forward-declaration-namespace, +bugprone-forwarding-reference-overload, +bugprone-inaccurate-erase, +bugprone-incorrect-roundings, +bugprone-integer-division, +bugprone-lambda-function-name, +bugprone-macro-parentheses, +bugprone-macro-repeated-side-effects, +bugprone-misplaced-operator-in-strlen-in-alloc, +bugprone-misplaced-pointer-arithmetic-in-alloc, +bugprone-misplaced-widening-cast, +bugprone-move-forwarding-reference, +bugprone-multiple-statement-macro, +bugprone-no-escape, +bugprone-parent-virtual-call, +bugprone-posix-return, +bugprone-reserved-identifier, +bugprone-sizeof-container, +bugprone-sizeof-expression, +bugprone-spuriously-wake-up-functions, +bugprone-string-constructor, +bugprone-string-integer-assignment, +bugprone-string-literal-with-embedded-nul, +bugprone-suspicious-enum-usage, +bugprone-suspicious-include, +bugprone-suspicious-memset-usage, 
+bugprone-suspicious-missing-comma, +bugprone-suspicious-semicolon, +bugprone-suspicious-string-compare, +bugprone-suspicious-memory-comparison, +bugprone-suspicious-realloc-usage, +bugprone-swapped-arguments, +bugprone-terminating-continue, +bugprone-throw-keyword-missing, +bugprone-too-small-loop-variable, +bugprone-undefined-memory-manipulation, +bugprone-undelegated-constructor, +bugprone-unhandled-self-assignment, +bugprone-unused-raii, +bugprone-unused-return-value, +bugprone-use-after-move, +bugprone-virtual-near-miss, +cert-dcl21-cpp, +cert-dcl58-cpp, +cert-err34-c, +cert-err52-cpp, +cert-err60-cpp, +cert-flp30-c, +cert-msc50-cpp, +cert-msc51-cpp, +cert-str34-c, +cppcoreguidelines-interfaces-global-init, +cppcoreguidelines-narrowing-conversions, +cppcoreguidelines-pro-type-member-init, +cppcoreguidelines-pro-type-static-cast-downcast, +cppcoreguidelines-slicing, +google-default-arguments, +google-explicit-constructor, +google-runtime-operator, +hicpp-exception-baseclass, +hicpp-multiway-paths-covered, +misc-misplaced-const, +misc-new-delete-overloads, +misc-no-recursion, +misc-non-copyable-objects, +misc-throw-by-value-catch-by-reference, +misc-unconventional-assign-operator, +misc-uniqueptr-reset-release, +modernize-avoid-bind, +modernize-concat-nested-namespaces, +modernize-deprecated-headers, +modernize-deprecated-ios-base-aliases, +modernize-loop-convert, +modernize-make-shared, +modernize-make-unique, +modernize-pass-by-value, +modernize-raw-string-literal, +modernize-redundant-void-arg, +modernize-replace-auto-ptr, +modernize-replace-disallow-copy-and-assign-macro, +modernize-replace-random-shuffle, +modernize-return-braced-init-list, +modernize-shrink-to-fit, +modernize-unary-static-assert, +modernize-use-auto, +modernize-use-bool-literals, +modernize-use-emplace, +modernize-use-equals-default, +modernize-use-equals-delete, +modernize-use-nodiscard, +modernize-use-noexcept, +modernize-use-nullptr, +modernize-use-override, 
+modernize-use-transparent-functors, +modernize-use-uncaught-exceptions, +mpi-buffer-deref, +mpi-type-mismatch, +openmp-use-default-none, +performance-faster-string-find, +performance-for-range-copy, +performance-implicit-conversion-in-loop, +performance-inefficient-algorithm, +performance-inefficient-string-concatenation, +performance-inefficient-vector-operation, +performance-move-const-arg, +performance-move-constructor-init, +performance-no-automatic-move, +performance-noexcept-move-constructor, +performance-trivially-destructible, +performance-type-promotion-in-math-fn, +performance-unnecessary-copy-initialization, +performance-unnecessary-value-param, +portability-simd-intrinsics, +readability-avoid-const-params-in-decls, +readability-const-return-type, +readability-container-size-empty, +readability-convert-member-functions-to-static, +readability-delete-null-pointer, +readability-deleted-default, +readability-inconsistent-declaration-parameter-name, +readability-make-member-function-const, +readability-misleading-indentation, +readability-misplaced-array-index, +readability-non-const-parameter, +readability-redundant-control-flow, +readability-redundant-declaration, +readability-redundant-function-ptr-dereference, +readability-redundant-smartptr-get, +readability-redundant-string-cstr, +readability-redundant-string-init, +readability-simplify-subscript-expr, +readability-static-accessed-through-instance, +readability-static-definition-in-anonymous-namespace, +readability-string-compare, +readability-uniqueptr-delete-release, +readability-use-anyofallof' \ No newline at end of file diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..90058ad --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +cmake-build-* +.idea diff --git a/AppStreamParser.cpp b/AppStreamParser.cpp new file mode 100644 index 0000000..1a9d185 --- /dev/null +++ b/AppStreamParser.cpp @@ -0,0 +1,513 @@ +/* + * Copyright 2024 Joel Winarske + * + * Licensed under the Apache 
License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "AppStreamParser.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// Define the chunk size +constexpr size_t CHUNK_SIZE = 1024; + +int convertToInt(const std::string &str) { + try { + const int result = std::stoi(str); + return result; + } catch (const std::invalid_argument &e) { + spdlog::error("Invalid argument: could not convert to int: {}", str); + throw; + } catch (const std::out_of_range &e) { + spdlog::error("Out of range: value is too large to fit in an int: {}", str); + throw; + } +} + +size_t convertToSizeT(const char *str) { + char *endPtr; + errno = 0; + const long long result = std::strtoll(str, &endPtr, 10); + + // Check for various possible errors + if ((errno == ERANGE && (result == LLONG_MAX || result == LLONG_MIN)) || + (errno != 0 && result == 0)) { + std::perror("Conversion error"); + return 0; // Or handle the error case more gracefully + } + + if (endPtr == str) { + spdlog::error("Invalid argument: could not convert to size_t: {}", str); + return 0; // Or handle the error case more gracefully + } + + if (result < 0 || static_cast(result) > std::numeric_limits::max()) { + spdlog::error("Value out of range: {}", str); + return 0; // Or handle the error case more gracefully + } + + return result; +} + +// Function to parse a const char* as a Unix timestamp and convert to ISO 8601 +std::string unixEpochToISO8601(const char *epochStr) 
{ + // Convert the input const char* to a time_t + const std::time_t epoch = std::stoll(epochStr); + + // Convert epoch to a time_point + const std::chrono::system_clock::time_point tp = std::chrono::system_clock::from_time_t(epoch); + + // Convert time_point to std::time_t to use std::gmtime + const std::time_t time = std::chrono::system_clock::to_time_t(tp); + const std::tm tm = *std::gmtime(&time); + + // Use stringstream to format the time as ISO 8601 + std::stringstream ss; + ss << std::put_time(&tm, "%Y-%m-%dT%H:%M:%S") << 'Z'; + + return ss.str(); +} + +void AppStreamParser::startElementCallback(void *user_data, const xmlChar *name, const xmlChar **attrs) { + auto *parser = static_cast(user_data); + const auto tag = reinterpret_cast(name); + parser->state_.currentElement = reinterpret_cast(name); + parser->state_.currentData.clear(); + + if (strcmp(tag, "component") == 0) { + parser->state_.insideComponent = true; + parser->state_.currentComponent = Component(); + return; + } + + if (strcmp(tag, "releases") == 0) { + parser->state_.insideReleases = true; + return; + } + + if (parser->state_.insideReleases) { + if (strcmp(tag, "release") == 0) { + parser->state_.currentRelease = Component::Release(); + // spec defaults + parser->state_.currentRelease.type = Component::ReleaseType::STABLE; + parser->state_.currentRelease.urgency = Component::ReleaseUrgency::MEDIUM; + if (attrs) { + for (int i = 0; attrs[i]; i += 2) { + const auto key = reinterpret_cast(attrs[i]); + const auto value = reinterpret_cast(attrs[i + 1]); + long unsigned int value_len = xmlStrlen(attrs[i + 1]); + if (strcmp(key, "type") == 0) { + parser->state_.currentRelease.type = Component::stringToReleaseType({value, value_len}); + } else if (strcmp(key, "version") == 0) { + parser->state_.currentRelease.version = {value, value_len}; + } else if (strcmp(key, "date") == 0) { + parser->state_.currentRelease.date = {value, value_len}; + } else if (strcmp(key, "timestamp") == 0) { + 
parser->state_.currentRelease.timestamp = unixEpochToISO8601(value); + } else if (strcmp(key, "date_eol") == 0) { + parser->state_.currentRelease.date_eol = {value, value_len}; + } else if (strcmp(key, "urgency") == 0) { + parser->state_.currentRelease.urgency = Component::stringToReleaseUrgency({value, value_len}); + } + } + } + return; + } + if (strcmp(tag, "issues") == 0) { + parser->state_.insideIssues = true; + return; + } + if (strcmp(tag, "issue") == 0) { + parser->state_.currentIssue = Component::Issue(); + // spec default + if (attrs) { + for (int i = 0; attrs[i]; i += 2) { + const auto key = reinterpret_cast(attrs[i]); + if (strcmp(key, "type") == 0) { + const auto value = reinterpret_cast(attrs[i + 1]); + long unsigned int value_len = xmlStrlen(attrs[i + 1]); + parser->state_.currentIssue.type = Component::stringToIssueType({value, value_len}); + } + if (strcmp(key, "url") == 0) { + const auto value = reinterpret_cast(attrs[i + 1]); + long unsigned int value_len = xmlStrlen(attrs[i + 1]); + parser->state_.currentIssue.url = {value, value_len}; + } + } + } + return; + } + if (strcmp(tag, "artifact") == 0) { + parser->state_.insideArtifact = true; + parser->state_.currentArtifact = Component::Artifact(); + } + } + + if (strcmp(tag, "icon") == 0) { + parser->state_.currentIcon = Component::Icon(); + if (attrs) { + for (int i = 0; attrs[i]; i += 2) { + const auto key = reinterpret_cast(attrs[i]); + const auto value = reinterpret_cast(attrs[i + 1]); + long unsigned int value_len = xmlStrlen(attrs[i + 1]); + if (strcmp(key, "type") == 0) { + parser->state_.currentIcon.type = Component::stringToIconType({value, value_len}); + } else if (strcmp(key, "width") == 0) { + parser->state_.currentIcon.width = convertToInt({value, value_len}); + } else if (strcmp(key, "height") == 0) { + parser->state_.currentIcon.height = convertToInt({value, value_len}); + } else if (strcmp(key, "scale") == 0) { + parser->state_.currentIcon.scale = convertToInt({value, value_len}); + 
} + } + } + return; + } + + if (attrs) { + for (int i = 0; attrs[i]; i += 2) { + const auto key = reinterpret_cast(attrs[i]); + const auto value = reinterpret_cast(attrs[i + 1]); + long unsigned int value_len = xmlStrlen(attrs[i + 1]); + if (strcmp(key, "xml:lang") == 0) { + if (const std::string lang(value); + !parser->state_.language.empty() && lang != parser->state_.language) { + parser->state_.currentElement = {}; + } + break; + } + if (strcmp(tag, "developer") == 0 && strcmp(key, "id") == 0) { + parser->state_.currentComponent.developer.id = value; + parser->state_.currentDeveloper = true; + break; + } + if (strcmp(tag, "bundle") == 0 && strcmp(key, "type") == 0) { + parser->state_.currentComponent.bundle.type = Component::stringToBundleType({value, value_len}); + break; + } + if (strcmp(tag, "url") == 0 && strcmp(key, "type") == 0) { + parser->state_.urlType = Component::stringToUrlType({value, value_len}); + break; + } + if (strcmp(tag, "launchable") == 0 && strcmp(key, "type") == 0) { + parser->state_.launchableType = Component::stringToLaunchableType({value, value_len}); + break; + } + } + } +} + +void AppStreamParser::endElementCallback(void *user_data, const xmlChar *name) { + auto *parser = static_cast(user_data); + const std::string currentElement(reinterpret_cast(name)); + + if (parser->state_.insideComponent) { + if (currentElement == "id") { + parser->state_.currentComponent.id = parser->state_.currentData; + } else if (currentElement == "pkgname") { + parser->state_.currentComponent.pkgname = parser->state_.currentData; + } else if (currentElement == "source_pkgname") { + parser->state_.currentComponent.source_pkgname = parser->state_.currentData; + } else if (currentElement == "name") { + if (parser->state_.currentDeveloper) { + parser->state_.currentComponent.developer.name = parser->state_.currentData; + } else { + parser->state_.currentComponent.name = parser->state_.currentData; + } + } else if (currentElement == "project_license") { + 
parser->state_.currentComponent.projectLicense = parser->state_.currentData; + } else if (currentElement == "summary") { + parser->state_.currentComponent.summary = parser->state_.currentData; + } else if (currentElement == "description") { + if (parser->state_.insideReleases) { + parser->state_.currentRelease.description = parser->state_.currentData; + } else { + parser->state_.currentComponent.description = parser->state_.currentData; + } + } else if (currentElement == "url") { + if (parser->state_.insideReleases) { + parser->state_.currentRelease.url = parser->state_.currentData; + } else { + if (parser->state_.urlType == Component::UrlType::HELP) { + parser->state_.currentComponent.url.help = parser->state_.currentData; + } else if (parser->state_.urlType == Component::UrlType::CONTACT) { + parser->state_.currentComponent.url.contact = parser->state_.currentData; + } else if (parser->state_.urlType == Component::UrlType::DONATION) { + parser->state_.currentComponent.url.donation = parser->state_.currentData; + } else if (parser->state_.urlType == Component::UrlType::HOMEPAGE) { + parser->state_.currentComponent.url.homepage = parser->state_.currentData; + } else if (parser->state_.urlType == Component::UrlType::TRANSLATE) { + parser->state_.currentComponent.url.translate = parser->state_.currentData; + } else if (parser->state_.urlType == Component::UrlType::FAQ) { + parser->state_.currentComponent.url.faq = parser->state_.currentData; + } else if (parser->state_.urlType == Component::UrlType::BUGTRACKER) { + parser->state_.currentComponent.url.bugtracker = parser->state_.currentData; + } else if (parser->state_.urlType == Component::UrlType::CONTRIBUTE) { + parser->state_.currentComponent.url.contribute = parser->state_.currentData; + } else if (parser->state_.urlType == Component::UrlType::VCS_BROWSER) { + parser->state_.currentComponent.url.vcs_browser = parser->state_.currentData; + } else { + parser->state_.currentComponent.url.unknown = 
parser->state_.currentData; + } + } + } else if (currentElement == "project_group") { + parser->state_.currentComponent.project_group = parser->state_.currentData; + } else if (currentElement == "compulsory_for_desktop") { + parser->state_.currentComponent.compulsory_for_desktop.push_back( + Component::stringToCompulsoryForDesktop(parser->state_.currentData)); + } else if (currentElement == "developer") { + parser->state_.currentDeveloper = false; + } else if (currentElement == "launchable") { + if (parser->state_.launchableType == Component::LaunchableType::URL) { + parser->state_.currentComponent.launchable.url = parser->state_.currentData; + } else if (parser->state_.launchableType == Component::LaunchableType::SERVICE) { + parser->state_.currentComponent.launchable.service = parser->state_.currentData; + } else if (parser->state_.launchableType == Component::LaunchableType::DESKTOP_ID) { + parser->state_.currentComponent.launchable.desktop_id = parser->state_.currentData; + } else if (parser->state_.launchableType == Component::LaunchableType::COCKPIT_MANIFEST) { + parser->state_.currentComponent.launchable.cockpit_manifest = parser->state_.currentData; + } else { + spdlog::error("Unknown launchable type: {}", parser->state_.currentData); + } + } else if (currentElement == "artifact") { + parser->state_.insideArtifact = false; + parser->state_.currentRelease.artifacts.push_back(parser->state_.currentArtifact); + } else if (parser->state_.insideArtifact && currentElement == "location") { + parser->state_.currentArtifact.location = parser->state_.currentData; + } else if (parser->state_.insideArtifact && currentElement == "checksum") { + parser->state_.currentArtifact.checksum[parser->state_.currentArtifactChecksumKey] = parser->state_. 
+ currentData; + } else if (parser->state_.insideArtifact && currentElement == "size") { + parser->state_.currentArtifact.size[parser->state_.currentArtifactSizeKey] = convertToSizeT( + parser->state_.currentData.c_str()); + } else if (currentElement == "bundle") { + parser->state_.currentComponent.bundle.id = parser->state_.currentData; + } else if (currentElement == "content_rating") { + parser->state_.currentComponent.content_rating = parser->state_.currentData; + } else if (currentElement == "agreement") { + parser->state_.currentComponent.agreement = parser->state_.currentData; + } else if (currentElement == "keyword") { + parser->state_.currentComponent.keywords.push_back(parser->state_.currentData); + } else if (currentElement == "category") { + parser->state_.currentComponent.categories.push_back(parser->state_.currentData); + } else if (currentElement == "icon") { + parser->state_.currentIcon.value = parser->state_.currentData; + parser->state_.currentComponent.icons.push_back(parser->state_.currentIcon); + } else if (currentElement == "suggest") { + parser->state_.currentComponent.suggests.push_back(parser->state_.currentData); + } else if (currentElement == "media_baseurl") { + parser->state_.currentComponent.media_baseurl = parser->state_.currentData; + } else if (currentElement == "architecture") { + parser->state_.currentComponent.architecture = parser->state_.currentData; + } else if (currentElement == "releases") { + parser->state_.insideReleases = false; + } else if (currentElement == "release") { + parser->state_.currentComponent.releases.push_back(parser->state_.currentRelease); + } else if (currentElement == "issues") { + parser->state_.insideIssues = false; + } else if (currentElement == "issue") { + parser->state_.currentRelease.issues.push_back(parser->state_.currentIssue); + } else if (currentElement == "language") { + parser->state_.currentComponent.addSupportedLanguage(parser->state_.currentData); + } else if (currentElement == 
"component") { + parser->state_.insideComponent = false; + assert(!parser->state_.currentComponent.id.empty()); + if (!parser->components_.count(parser->state_.currentComponent.id)) { + parser->components_[parser->state_.currentComponent.id] = parser->state_.currentComponent; + } else { + SPDLOG_WARN("Duplicate: [{}]", parser->state_.currentComponent.id); + } + } + } + + parser->state_.currentData.clear(); + parser->state_.currentElement = {}; + parser->state_.currentIcon = {}; +} + +void AppStreamParser::charactersCallback(void *user_data, const xmlChar *ch, const int len) { + spdlog::debug("{}", std::string_view(reinterpret_cast(ch), len)); + auto *parser = static_cast(user_data); + if (!parser->state_.currentElement.empty()) { + parser->state_.currentData.append(reinterpret_cast(ch), len); + } +} + +AppStreamParser::AppStreamParser(const std::string &filename, const std::string &language) + : language_(language) { + state_.language = language; + parseFile(filename); +} + +AppStreamParser::~AppStreamParser() { + munmapFile(); +} + +void AppStreamParser::mmapFile(const std::string &filename) { + int fd = open(filename.c_str(), O_RDONLY); + if (fd == -1) { + spdlog::error("Failed to open file: {}", filename); + exit(EXIT_FAILURE); + } + + struct stat sb{}; + if (fstat(fd, &sb) == -1) { + spdlog::error("Failed to get file size: {}", filename); + close(fd); + exit(EXIT_FAILURE); + } + fileSize_ = sb.st_size; + + fileData_ = mmap(nullptr, fileSize_, PROT_READ, MAP_PRIVATE, fd, 0); + // Close the file descriptor as it's no longer needed after mapping + close(fd); + if (fileData_ == MAP_FAILED) { + spdlog::error("Failed to memory-map file: {}", filename); + exit(EXIT_FAILURE); + } +} + +void AppStreamParser::munmapFile() { + if (fileData_ && fileData_ != MAP_FAILED) { + munmap(fileData_, fileSize_); + fileData_ = nullptr; + } +} + +void AppStreamParser::parseFile(const std::string &filename) { + mmapFile(filename); + + xmlSAXHandler saxHandler = { + .startElement = 
startElementCallback, + .endElement = endElementCallback, + .characters = charactersCallback, + }; + + spdlog::info("Parsing file: {}", filename); + + std::unique_ptr ctxt( + xmlCreatePushParserCtxt(&saxHandler, this, static_cast(fileData_), 4, filename.c_str()), + xmlFreeParserCtxt); + + size_t offset = 4; + while (offset < fileSize_) { + const size_t chunkSize = std::min(CHUNK_SIZE, fileSize_ - offset); + if (int ret = xmlParseChunk(ctxt.get(), static_cast(fileData_) + offset, + static_cast(chunkSize), 0); + ret != 0) { + spdlog::error("Failed to parse XML, error code: {}", ret); + munmapFile(); + exit(EXIT_FAILURE); + } + offset += chunkSize; + } + + // Send an EOF indication + if (int ret = xmlParseChunk(ctxt.get(), nullptr, 0, 1); ret != 0) { + spdlog::error("Failed to parse XML, error code: {}", ret); + munmapFile(); + exit(EXIT_FAILURE); + } + + munmapFile(); +} + +std::vector AppStreamParser::getUniqueCategories() { + std::unordered_set uniqueCategories; + + for (const auto &[key, component]: components_) { + uniqueCategories.insert(component.categories.begin(), component.categories.end()); + } + + return {uniqueCategories.begin(), uniqueCategories.end()}; +} + +std::vector AppStreamParser::getUniqueKeywords() { + std::unordered_set uniqueKeywords; + + for (const auto &[key, component]: components_) { + uniqueKeywords.insert(component.keywords.begin(), component.keywords.end()); + } + + return {uniqueKeywords.begin(), uniqueKeywords.end()}; +} + +std::vector AppStreamParser::getSortedComponents(const SortOption option) { + // Convert unordered_map to vector for sorting + std::vector sortedComponents; + sortedComponents.reserve(components_.size()); + for (const auto &[key, component]: components_) { + sortedComponents.push_back(component); + } + + // Sort based on the option + switch (option) { + case SortOption::BY_ID: + std::sort(sortedComponents.begin(), sortedComponents.end(), [](const Component &a, const Component &b) { + return a.id < b.id; + }); + 
break; + case SortOption::BY_NAME: + std::sort(sortedComponents.begin(), sortedComponents.end(), [](const Component &a, const Component &b) { + return a.name < b.name; + }); + break; + default: + throw std::invalid_argument("Invalid sort option"); + } + + return sortedComponents; +} + +std::vector AppStreamParser::searchByCategory(const std::string &category) { + std::vector result; + + for (const auto &[key, component]: components_) { + if (std::find(component.categories.begin(), component.categories.end(), category) != component.categories. + end()) { + result.push_back(component); + } + } + + return result; +} + +std::vector AppStreamParser::searchByKeyword(const std::string &keyword) { + std::vector result; + + for (const auto &[key, component]: components_) { + if (std::find(component.keywords.begin(), component.keywords.end(), keyword) != component.keywords.end()) { + result.push_back(component); + } + } + + return result; +} + +size_t AppStreamParser::getTotalComponentCount() const { + return components_.size(); +} + +const std::map &AppStreamParser::getComponents() const { + return components_; +} diff --git a/AppStreamParser.h b/AppStreamParser.h new file mode 100644 index 0000000..4a39254 --- /dev/null +++ b/AppStreamParser.h @@ -0,0 +1,97 @@ +/* +* Copyright 2024 Joel Winarske + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef APPSTREAMPARSER_H +#define APPSTREAMPARSER_H + +#include "Component.h" +#include +#include +#include +#include + + +class AppStreamParser { +public: + explicit AppStreamParser(const std::string &filename, const std::string &language); + + ~AppStreamParser(); + + std::vector getUniqueCategories(); + + std::vector getUniqueKeywords(); + + std::vector searchByCategory(const std::string &category); + + std::vector searchByKeyword(const std::string &keyword); + + enum class SortOption { BY_ID, BY_NAME }; + + std::vector getSortedComponents(SortOption option); + + [[nodiscard]] size_t getTotalComponentCount() const; + + [[nodiscard]] const std::map &getComponents() const; + +private: + static constexpr char kEmptyString[] = ""; + static constexpr char kReleaseTypeStable[] = "stable"; + static constexpr char kReleaseUrgencyMedium[] = "medium"; + static constexpr char kIssueTypeGeneric[] = "generic"; + + std::map components_; + std::string language_; + + struct ParsingState { + bool insideComponent = false; + bool insideReleases = false; + bool insideIssues = false; + bool insideArtifact = true; + bool currentDeveloper = false; + + std::string currentElement; + Component currentComponent; + Component::Icon currentIcon; + std::string currentData; + Component::UrlType urlType; + Component::LaunchableType launchableType; + Component::Release currentRelease; + Component::Issue currentIssue; + Component::Artifact currentArtifact; + std::string currentArtifactChecksumKey; + std::string currentArtifactSizeKey; + std::string language; + }; + + ParsingState state_; + + static void startElementCallback(void *user_data, const xmlChar *name, const xmlChar **attrs); + + static void endElementCallback(void *user_data, const xmlChar *name); + + static void charactersCallback(void *user_data, const xmlChar *ch, int len); + + size_t fileSize_ = 0; + void *fileData_ = nullptr; + + void parseFile(const std::string &filename); + + void mmapFile(const std::string &filename); + 
+ void munmapFile(); +}; + +#endif // APPSTREAMPARSER_H diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..e2966a8 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,27 @@ +cmake_minimum_required(VERSION 3.14) +project(appstream_sax) + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +find_package(LibXml2 REQUIRED) + +add_executable(${PROJECT_NAME} + AppStreamParser.cpp + Component.cpp + AppStreamParser.h + Component.h + main.cpp +) + +include(FetchContent) + +FetchContent_Declare( + spdlog + GIT_REPOSITORY https://github.com/gabime/spdlog.git + GIT_TAG v1.15.0 +) + +FetchContent_MakeAvailable(spdlog) + +target_link_libraries(${PROJECT_NAME} PRIVATE spdlog::spdlog LibXml2::LibXml2) \ No newline at end of file diff --git a/Component.cpp b/Component.cpp new file mode 100644 index 0000000..e39329f --- /dev/null +++ b/Component.cpp @@ -0,0 +1,326 @@ +/* +* Copyright 2024 Joel Winarske + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "Component.h" + +#include "spdlog/spdlog.h" + +Component::BundleType Component::stringToBundleType(const std::string &typeStr) { + if (typeStr == "package") return BundleType::PACKAGE; + if (typeStr == "limba") return BundleType::LIMBA; + if (typeStr == "flatpak") return BundleType::FLATPAK; + if (typeStr == "appimage") return BundleType::APPIMAGE; + if (typeStr == "snap") return BundleType::SNAP; + if (typeStr == "tarball") return BundleType::TARBALL; + if (typeStr == "cabinet") return BundleType::CABINET; + if (typeStr == "linglong") return BundleType::LINGLONG; + return BundleType::UNKNOWN; +} + +std::string Component::bundleTypeToString(const BundleType type) { + switch (type) { + case BundleType::PACKAGE: return "package"; + case BundleType::LIMBA: return "limba"; + case BundleType::FLATPAK: return "flatpak"; + case BundleType::APPIMAGE: return "appimage"; + case BundleType::SNAP: return "snap"; + case BundleType::TARBALL: return "tarball"; + case BundleType::CABINET: return "cabinet"; + case BundleType::LINGLONG: return "linglong"; + default: return "unknown"; + } +} + +Component::IconType Component::stringToIconType(const std::string &typeStr) { + if (typeStr == "stock") return Component::IconType::STOCK; + if (typeStr == "cached") return Component::IconType::CACHED; + if (typeStr == "local") return Component::IconType::LOCAL; + if (typeStr == "url") return Component::IconType::URL; + if (typeStr == "remote") return Component::IconType::REMOTE; + return IconType::UNKNOWN; +} + +std::string Component::iconTypeToString(const IconType type) { + switch (type) { + case IconType::STOCK: return "stock"; + case IconType::CACHED: return "cached"; + case IconType::LOCAL: return "local"; + case IconType::URL: return "url"; + case IconType::REMOTE: return "remote"; + default: return "unknown"; + } +} + +Component::CompulsoryForDesktop Component::stringToCompulsoryForDesktop(const std::string &desktopString) { + static const std::unordered_map 
stringToEnumMap = { + {"COSMIC", CompulsoryForDesktop::COSMIC}, + {"GNOME", CompulsoryForDesktop::GNOME}, + {"GNOME-Classic", CompulsoryForDesktop::GNOME_Classic}, + {"GNOME-Flashback", CompulsoryForDesktop::GNOME_Flashback}, + {"KDE", CompulsoryForDesktop::KDE}, + {"LXDE", CompulsoryForDesktop::LXDE}, + {"LXQt", CompulsoryForDesktop::LXQt}, + {"MATE", CompulsoryForDesktop::MATE}, + {"Razor", CompulsoryForDesktop::Razor}, + {"ROX", CompulsoryForDesktop::ROX}, + {"TDE", CompulsoryForDesktop::TDE}, + {"Unity", CompulsoryForDesktop::Unity}, + {"XFCE", CompulsoryForDesktop::XFCE}, + {"EDE", CompulsoryForDesktop::EDE}, + {"Cinnamon", CompulsoryForDesktop::Cinnamon}, + {"Pantheon", CompulsoryForDesktop::Pantheon}, + {"DDE", CompulsoryForDesktop::DDE}, + {"Endless", CompulsoryForDesktop::Endless}, + {"Old", CompulsoryForDesktop::Old} + }; + + if (const auto it = stringToEnumMap.find(desktopString); it != stringToEnumMap.end()) { + return it->second; + } + return CompulsoryForDesktop::UNKNOWN; +} + +std::string Component::compulsoryForDesktopToString(const CompulsoryForDesktop desktopEnum) { + static const std::unordered_map enumToStringMap = { + {CompulsoryForDesktop::COSMIC, "COSMIC"}, + {CompulsoryForDesktop::GNOME, "GNOME"}, + {CompulsoryForDesktop::GNOME_Classic, "GNOME-Classic"}, + {CompulsoryForDesktop::GNOME_Flashback, "GNOME-Flashback"}, + {CompulsoryForDesktop::KDE, "KDE"}, + {CompulsoryForDesktop::LXDE, "LXDE"}, + {CompulsoryForDesktop::LXQt, "LXQt"}, + {CompulsoryForDesktop::MATE, "MATE"}, + {CompulsoryForDesktop::Razor, "Razor"}, + {CompulsoryForDesktop::ROX, "ROX"}, + {CompulsoryForDesktop::TDE, "TDE"}, + {CompulsoryForDesktop::Unity, "Unity"}, + {CompulsoryForDesktop::XFCE, "XFCE"}, + {CompulsoryForDesktop::EDE, "EDE"}, + {CompulsoryForDesktop::Cinnamon, "Cinnamon"}, + {CompulsoryForDesktop::Pantheon, "Pantheon"}, + {CompulsoryForDesktop::DDE, "DDE"}, + {CompulsoryForDesktop::Endless, "Endless"}, + {CompulsoryForDesktop::Old, "Old"} + }; + + if (const auto 
it = enumToStringMap.find(desktopEnum); it != enumToStringMap.end()) { + return it->second; + } + return "unknown"; +} + +Component::UrlType Component::stringToUrlType(const std::string &typeStr) { + if (typeStr == "homepage") return UrlType::HOMEPAGE; + if (typeStr == "bugtracker") return UrlType::BUGTRACKER; + if (typeStr == "faq") return UrlType::FAQ; + if (typeStr == "help") return UrlType::HELP; + if (typeStr == "donation") return UrlType::DONATION; + if (typeStr == "translate") return UrlType::TRANSLATE; + if (typeStr == "contact") return UrlType::CONTACT; + if (typeStr == "vcs-browser") return UrlType::VCS_BROWSER; + if (typeStr == "contribute") return UrlType::CONTRIBUTE; + return UrlType::UNKNOWN; +} + +Component::LaunchableType Component::stringToLaunchableType(const std::string &typeStr) { + if (typeStr == "desktop-id") return LaunchableType::DESKTOP_ID; + if (typeStr == "service") return LaunchableType::SERVICE; + if (typeStr == "cockpit-manifest") return LaunchableType::COCKPIT_MANIFEST; + if (typeStr == "url") return LaunchableType::URL; + return LaunchableType::UNKNOWN; +} + +Component::ReleaseType Component::stringToReleaseType(const std::string &typeStr) { + if (typeStr == "stable") return ReleaseType::STABLE; + if (typeStr == "development") return ReleaseType::DEVELOPMENT; + if (typeStr == "snapshot") return ReleaseType::SNAPSHOT; + return ReleaseType::UNKNOWN; +} + +Component::ReleaseUrgency Component::stringToReleaseUrgency(const std::string &typeStr) { + if (typeStr == "low") return ReleaseUrgency::LOW; + if (typeStr == "medium") return ReleaseUrgency::MEDIUM; + if (typeStr == "high") return ReleaseUrgency::HIGH; + if (typeStr == "critical") return ReleaseUrgency::CRITICAL; + return ReleaseUrgency::UNKNOWN; +} + +std::string Component::releaseUrgencyToString(const ReleaseUrgency type) { + if (type == ReleaseUrgency::LOW) return "low"; + if (type == ReleaseUrgency::MEDIUM) return "medium"; + if (type == ReleaseUrgency::HIGH) return "high"; + if 
(type == ReleaseUrgency::CRITICAL) return "critical"; + return "unknown"; +} + +Component::IssueType Component::stringToIssueType(const std::string &typeStr) { + if (typeStr == "generic") return IssueType::GENERIC; + if (typeStr == "cve") return IssueType::CVE; + return IssueType::UNKNOWN; +} + +std::string Component::releaseTypeToString(const ReleaseType type) { + switch (type) { + case ReleaseType::STABLE: return "stable"; + case ReleaseType::SNAPSHOT: return "snapshot"; + case ReleaseType::DEVELOPMENT: return "development"; + default: return "unknown"; + } +} + +std::string Component::issueTypeToString(const IssueType type) { + switch (type) { + case IssueType::GENERIC: return "generic"; + case IssueType::CVE: return "cve"; + default: return "unknown"; + } +} + +void Component::addSupportedLanguage(const std::string &language) { + supportedLanguages.push_back(language); +} + +void Component::Dump() const { + spdlog::info("id: {}", id); + spdlog::info("\tname: {}", name); + spdlog::info("\tproject_license: {}", projectLicense); + spdlog::info("\tsummary: {}", summary); + if (!pkgname.empty()) { + spdlog::info("\tpkgname: {}", pkgname); + } + if (!source_pkgname.empty()) { + spdlog::info("\tsource_pkgname: {}", source_pkgname); + } + if (!description.empty()) { + spdlog::info("\tdescription: {}", description); + } + if (!url.homepage.empty()) { + spdlog::info("\thomepage: {}", url.homepage); + } + if (!url.bugtracker.empty()) { + spdlog::info("\tbugtracker: {}", url.bugtracker); + } + if (!url.faq.empty()) { + spdlog::info("\tfaq: {}", url.faq); + } + if (!url.help.empty()) { + spdlog::info("\thelp: {}", url.help); + } + if (!url.donation.empty()) { + spdlog::info("\tdonation: {}", url.donation); + } + if (!url.translate.empty()) { + spdlog::info("\ttranslate: {}", url.translate); + } + if (!url.contact.empty()) { + spdlog::info("\tcontact: {}", url.contact); + } + if (!url.vcs_browser.empty()) { + spdlog::info("\tvcs_browser: {}", url.vcs_browser); + } + if 
(!url.contribute.empty()) { + spdlog::info("\tcontribute: {}", url.contribute); + } + if (!url.unknown.empty()) { + spdlog::info("\tunknown: {}", url.unknown); + } + if (!project_group.empty()) { + spdlog::info("\tproject_group: {}", project_group); + } + if (!developer.id.empty()) { + spdlog::info("\tdeveloper id: {}", developer.id); + } + if (!developer.name.empty()) { + spdlog::info("\tdeveloper name: {}", developer.name); + } + if (!launchable.desktop_id.empty()) { + spdlog::info("\tlaunchable desktop_id: {}", launchable.desktop_id); + } + if (!launchable.service.empty()) { + spdlog::info("\tlaunchable service: {}", launchable.service); + } + if (!launchable.cockpit_manifest.empty()) { + spdlog::info("\tlaunchable cockpit_manifest: {}", launchable.cockpit_manifest); + } + if (!launchable.url.empty()) { + spdlog::info("\tlaunchable url: {}", launchable.url); + } + for (const auto &comp: compulsory_for_desktop) { + spdlog::info("compulsory_for_desktop: {}", Component::compulsoryForDesktopToString(comp)); + } + for (const auto &[type, value, width, height, scale]: icons) { + spdlog::info("\ticon"); + spdlog::info("\t\ttype: {}", Component::iconTypeToString(type)); + spdlog::info("\t\tvalue: {}", value); + if (width.has_value()) { + spdlog::info("\t\twidth: {}", width.value()); + } + if (height.has_value()) { + spdlog::info("\t\theight: {}", height.value()); + } + if (scale.has_value()) { + spdlog::info("\t\tscale: {}", scale.value()); + } + } + for (const auto &[type, version, date, timestamp, date_eol, urgency, description, url, issues, artifacts]: + releases) { + spdlog::info("\trelease"); + spdlog::info("\t\ttype: {}", releaseTypeToString(type)); + spdlog::info("\t\tversion: {}", version); + if (!date.empty()) { + spdlog::info("\t\tdate: {}", date); + } + if (!timestamp.empty()) { + spdlog::info("\t\ttimestamp: {}", timestamp); + } + if (!date_eol.empty()) { + spdlog::info("\t\tdate_eol: {}", date_eol); + } + spdlog::info("\t\turgency: {}", 
releaseUrgencyToString(urgency)); + if (!description.empty()) { + spdlog::info("\t\tdescription: {}", description); + } + if (!url.empty()) { + spdlog::info("\t\turl: {}", url); + } + for (const auto &[type, url, value]: issues) { + spdlog::info("\t\tissue"); + spdlog::info("\t\t\ttype: {}", issueTypeToString(type)); + if (!url.empty()) { + spdlog::info("\t\t\turl: {}", url); + } + if (!value.empty()) { + spdlog::info("\t\t\tvalue: {}", value); + } + } + for (const auto &[location, checksum, size]: artifacts) { + spdlog::info("\t\tartifact"); + if (!location.empty()) { + spdlog::info("\t\t\tlocation: {}", location); + } + if (!checksum.empty()) { + for (const auto &[key, value]: checksum) { + spdlog::info("\t\t\tchecksum: {} = {}", key, value); + } + } + if (!size.empty()) { + for (const auto &[key, value]: size) { + spdlog::info("\t\t\tsize: {} = {}", key, value); + } + } + } + } +} diff --git a/Component.h b/Component.h new file mode 100644 index 0000000..b200973 --- /dev/null +++ b/Component.h @@ -0,0 +1,235 @@ +/* +* Copyright 2024 Joel Winarske + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef COMPONENT_H +#define COMPONENT_H + +#include +#include +#include +#include +#include + + +class Component { +public: + enum class BundleType { + UNKNOWN = 0, + PACKAGE, + LIMBA, + FLATPAK, + APPIMAGE, + SNAP, + TARBALL, + CABINET, + LINGLONG + }; + + struct Bundle { + std::string id; + BundleType type; + }; + + enum class IconType { + UNKNOWN = 0, + STOCK, + CACHED, + LOCAL, + URL, + REMOTE + }; + + struct Icon { + IconType type; + std::string value; + std::optional width; + std::optional height; + std::optional scale; + }; + + enum class CompulsoryForDesktop { + UNKNOWN = 0, + COSMIC, + GNOME, + GNOME_Classic, + GNOME_Flashback, + KDE, + LXDE, + LXQt, + MATE, + Razor, + ROX, + TDE, + Unity, + XFCE, + EDE, + Cinnamon, + Pantheon, + DDE, + Endless, + Old + }; + + enum class UrlType { + UNKNOWN = 0, + HOMEPAGE, + BUGTRACKER, + FAQ, + HELP, + DONATION, + TRANSLATE, + CONTACT, + VCS_BROWSER, + CONTRIBUTE + }; + + enum class LaunchableType { + UNKNOWN, + DESKTOP_ID, + SERVICE, + COCKPIT_MANIFEST, + URL + }; + + enum class ReleaseType { + UNKNOWN = 0, + STABLE, + DEVELOPMENT, + SNAPSHOT + }; + + enum class ReleaseUrgency { + UNKNOWN = 0, + LOW, + MEDIUM, + HIGH, + CRITICAL + }; + + enum class IssueType { + UNKNOWN, + GENERIC, + CVE + }; + + struct Artifact { + std::string location; + std::unordered_map checksum; + std::unordered_map size; + }; + + struct Issue { + IssueType type; + std::string url; + std::string value; + }; + + struct Release { + ReleaseType type; + std::string version; + std::string date; + std::string timestamp; + std::string date_eol; + ReleaseUrgency urgency; + std::string description; + std::string url; + std::vector issues; + std::vector artifacts; + }; + + std::string id; + std::string pkgname; + std::string source_pkgname; + std::string name; + std::string summary; + std::string projectLicense; + std::string description; + + struct { + std::string homepage; + std::string bugtracker; + std::string faq; + std::string help; + 
std::string donation; + std::string translate; + std::string contact; + std::string vcs_browser; + std::string contribute; + std::string unknown; + } url; + + std::string project_group; + std::vector icons; + std::vector compulsory_for_desktop; + + struct { + std::string id; + std::string name; + } developer; + + struct { + LaunchableType type; + std::string desktop_id; + std::string service; + std::string cockpit_manifest; + std::string url; + } launchable; + + std::string media_baseurl; + std::string architecture; + Bundle bundle; + std::string content_rating; + std::string agreement; + std::vector keywords; + std::vector categories; + std::vector suggests; + std::vector releases; + std::vector supportedLanguages; + + void Dump() const; + + void addSupportedLanguage(const std::string &language); + + static BundleType stringToBundleType(const std::string &typeStr); + + static IconType stringToIconType(const std::string &typeStr); + + static CompulsoryForDesktop stringToCompulsoryForDesktop(const std::string &desktopString); + + static UrlType stringToUrlType(const std::string &typeStr); + + static LaunchableType stringToLaunchableType(const std::string &typeStr); + + static ReleaseType stringToReleaseType(const std::string &typeStr); + + static ReleaseUrgency stringToReleaseUrgency(const std::string &typeStr); + + static IssueType stringToIssueType(const std::string &typeStr); + + static std::string bundleTypeToString(BundleType type); + + static std::string iconTypeToString(IconType type); + + static std::string compulsoryForDesktopToString(CompulsoryForDesktop desktopEnum); + + static std::string releaseTypeToString(ReleaseType type); + + static std::string releaseUrgencyToString(ReleaseUrgency type); + + static std::string issueTypeToString(IssueType type); +}; + +#endif // COMPONENT_H diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..ff81822 --- /dev/null +++ b/LICENSE @@ -0,0 +1,13 @@ +Copyright 2024 Joel Winarske + +Licensed under the Apache 
License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..24e0968 --- /dev/null +++ b/README.md @@ -0,0 +1,76 @@ +# appstream_parser + +An `Appstream 1.0` XML parser written in C++. + +Not yet spec compliant. + +### Introduction + +Appstream XML 1.0 is used for describing Flatpak components. + +A Flatpak component repository serves an `appstream.xml.gz` file that describes all the components it hosts. + +One example of a Flatpak component registry is Flathub. The uncompressed `appstream.xml` that Flathub serves is quite +large: + +``` +-rw-rw-r-- 1 joel joel 36334733 Nov 11 09:22 appstream.xml +``` + +Working with large XML files presents challenges for resource-constrained computing devices. This application targets a +tradeoff between speed and runtime RAM footprint. + +This application is intended to be the basis for creating a Flutter software catalog. Not all of the Appstream data +needs to be presented to the user, which helps to reduce the RAM footprint: we only store the data we need to present. + +#### DOM vs SAX parsers + +With a DOM parser, the XML file is read all at once and a RAM representation of the XML document is created. This creates a +very large runtime footprint, which is not conducive to memory-constrained devices. + +SAX parsers read one block at a time and invoke user callbacks. This approach allows working with larger XML files with a +much lower memory footprint. 
+ +#### Memory Mapped files + +The use of memory mapped files improves access time to the underlying file contents. The operating system handles paging +the file contents into RAM. + +#### Minimizing Parser RAM usage + +* Experiments show that libxml2 chunk reading generates the smallest RAM usage. +* Passing in the entire memory mapped file at once ends up with a much larger RAM consumption. +* Parsing 1k chunks shows the smallest RAM consumption. +* Increasing the read chunk size directly impacts RAM usage post parse, which would indicate that the SAX parser cleans + up heap allocations after each chunk parse. + +#### Alternate XML libraries + +* pugixml - (DOM parser) produces the largest RAM footprint. Not usable. +* rapidxml - (DOM parser) produces the second largest RAM footprint. It has some nice features, and is high quality work. A + SAX parser would be a nice addition. Note that it requires C++20 at minimum. + +#### std::string_view usage + +std::string_view can be used to wrap existing strings that are not null-terminated. + +std::string_view works with memory mapped files. None of the parsers tested reference the original string +pointer directly, likely because of string fixups and XML spec requirements. If the source data is known and no string +fixups are required, one could reference the raw pointers directly, which would make parsing a large XML file much +faster, with a smaller memory footprint. + +Rapidxml leverages std::string_view; unfortunately, tests show it produces 2x the RAM footprint. This library could +likely be optimized to directly use pointers in the memory mapped file in order to limit heap allocations. + +There could be more work in this area to improve RAM consumption. + +The libxmlb C library works around strings in the XML that are not null-terminated by post-processing the XML data into a binary blob: +https://github.com/hughsie/libxmlb. This is a massive workaround to a problem better solved using std::string_view. 
+ +### References + +* https://www.freedesktop.org/software/appstream/docs/ +* https://github.com/flatpak/ +* https://gitlab.gnome.org/GNOME/libxml2 +* https://github.com/dwd/rapidxml +* https://github.com/zeux/pugixml diff --git a/main.cpp b/main.cpp new file mode 100644 index 0000000..30f53aa --- /dev/null +++ b/main.cpp @@ -0,0 +1,185 @@ +/* +* Copyright 2024 Joel Winarske + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "AppStreamParser.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// Function to get memory usage from /proc/self/stat +void getMemoryUsage(double &vm_usage, double &resident_set) { + using std::ios_base; + using std::ifstream; + using std::string; + + vm_usage = 0.0; + resident_set = 0.0; + + // 'file' represents the file "/proc/self/stat" + ifstream file("/proc/self/stat", ios_base::in); + + // Read in the data from the file + string line; + std::getline(file, line); + file.close(); + + // Split the line by whitespace + std::istringstream iss(line); + std::vector fields; + string value; + while (iss >> value) { + fields.push_back(value); + } + + // fields[22] and fields[23] represent virtual memory and resident set size in kB + if (fields.size() >= 24) { + unsigned long vsize = std::stoul(fields[22]); + long rss = std::stol(fields[23]); + + long page_size_kb = sysconf(_SC_PAGE_SIZE) / 1024; // in case x86-64 is configured to use 2MB pages + vm_usage = vsize / 1024.0; + 
resident_set = rss * page_size_kb; + } +} + +// Function to get the file size in bytes +long getFileSize(const std::string &filename) { + struct stat stat_buf{}; + const int rc = stat(filename.c_str(), &stat_buf); + return rc == 0 ? stat_buf.st_size : -1; +} + +int main(const int argc, char *argv[]) { + if (argc < 2) { + spdlog::error("Usage: {} [language]", argv[0]); + return EXIT_FAILURE; + } + + std::string filename = argv[1]; + std::string language = (argc >= 3) ? argv[2] : ""; + + // Check if the file exists and get file size + const long filesize = getFileSize(filename); + if (filesize == -1) { + spdlog::error("File '{}' not found or could not be accessed.", filename); + return EXIT_FAILURE; + } + double filesize_mib = static_cast(filesize) / (1024.0 * 1024.0); + spdlog::info("File '{}' is present with size {:.2f} MiB.", filename, filesize_mib); + + double vm_usage, resident_set; + + // Before parser allocation + getMemoryUsage(vm_usage, resident_set); + spdlog::info("Before parser allocation - Virtual Memory: {} KB, Resident set size: {} KB", vm_usage, resident_set); + + try { + spdlog::info("Initializing AppStreamParser with file: '{}' and language: '{}'", filename, language); + + auto parser = std::make_unique(filename, language); + + // After parser allocation + getMemoryUsage(vm_usage, resident_set); + spdlog::info("After parser allocation - Virtual Memory: {} KB, Resident set size: {} KB", vm_usage, + resident_set); + + spdlog::info("Parsing completed. 
Total components: {}", parser->getTotalComponentCount()); + + // After parsing + getMemoryUsage(vm_usage, resident_set); + spdlog::info("After parsing - Virtual Memory: {} KB, Resident set size: {} KB", vm_usage, resident_set); + + const auto categories = parser->getUniqueCategories(); + spdlog::info("Unique Categories:"); + for (const auto &category: categories) { + spdlog::info("- {}", category); + } + + // After getting unique categories + getMemoryUsage(vm_usage, resident_set); + spdlog::info("After getting unique categories - Virtual Memory: {} KB, Resident set size: {} KB", vm_usage, + resident_set); + + const auto keywords = parser->getUniqueKeywords(); + spdlog::info("Unique Keywords:"); + for (const auto &keyword: keywords) { + spdlog::info("- {}", keyword); + } + + // After getting unique keywords + getMemoryUsage(vm_usage, resident_set); + spdlog::info("After getting unique keywords - Virtual Memory: {} KB, Resident set size: {} KB", vm_usage, + resident_set); + + // Example searches + const std::string sampleCategory = "utility"; + const auto componentsByCategory = parser->searchByCategory(sampleCategory); + spdlog::info("Components in category '{}', ({}):", sampleCategory, sampleCategory.size()); + for (const auto &component: componentsByCategory) { + component.Dump(); + } + + // After searching by category + getMemoryUsage(vm_usage, resident_set); + spdlog::info("After searching by category - Virtual Memory: {} KB, Resident set size: {} KB", vm_usage, + resident_set); + + const std::string sampleKeyword = "editor"; + const auto componentsByKeyword = parser->searchByKeyword(sampleKeyword); + spdlog::info("Components with keyword '{}', ({}):", sampleKeyword, componentsByKeyword.size()); + for (const auto &comp: componentsByKeyword) { + comp.Dump(); + } + + const auto components = parser->getComponents(); + // for (const auto &[fst, snd]: components) { + // printComponent(fst, snd); + // } + spdlog::info("Component Count: {}", components.size()); + + // 
After searching by keyword + getMemoryUsage(vm_usage, resident_set); + spdlog::info("Before sorting - Virtual Memory: {} KB, Resident set size: {} KB", vm_usage, + resident_set); + const auto sortedById = parser->getSortedComponents(AppStreamParser::SortOption::BY_ID); + // After searching by keyword + getMemoryUsage(vm_usage, resident_set); + spdlog::info("After sorting - Virtual Memory: {} KB, Resident set size: {} KB", vm_usage, + resident_set); + + // After searching by keyword + getMemoryUsage(vm_usage, resident_set); + spdlog::info("After searching by keyword - Virtual Memory: {} KB, Resident set size: {} KB", vm_usage, + resident_set); + + parser.reset(); + + getMemoryUsage(vm_usage, resident_set); + spdlog::info("After reseting parser - Virtual Memory: {} KB, Resident set size: {} KB", vm_usage, + resident_set); + } catch (const std::exception &e) { + spdlog::error("Exception occurred: {}", e.what()); + return EXIT_FAILURE; + } + + return EXIT_SUCCESS; +}