From 5ca57b42400978881939751258842156cecc10f0 Mon Sep 17 00:00:00 2001
From: Tess Neau
Date: Thu, 26 Sep 2024 11:02:00 -0400
Subject: [PATCH] add parse_url dd grok filter

---
 src/datadog/grok/filters/mod.rs |   1 +
 src/datadog/grok/filters/url.rs | 155 ++++++++++++++++++++++++++++++++
 src/datadog/grok/grok_filter.rs |   8 +-
 src/datadog/grok/parse_grok.rs  |  24 +++++
 4 files changed, 187 insertions(+), 1 deletion(-)
 create mode 100644 src/datadog/grok/filters/url.rs

diff --git a/src/datadog/grok/filters/mod.rs b/src/datadog/grok/filters/mod.rs
index 1a889e4fe6..87b7612fec 100644
--- a/src/datadog/grok/filters/mod.rs
+++ b/src/datadog/grok/filters/mod.rs
@@ -1,2 +1,3 @@
 pub mod array;
 pub mod keyvalue;
+pub mod url;
diff --git a/src/datadog/grok/filters/url.rs b/src/datadog/grok/filters/url.rs
new file mode 100644
index 0000000000..518117269d
--- /dev/null
+++ b/src/datadog/grok/filters/url.rs
@@ -0,0 +1,155 @@
+use crate::compiler::prelude::*;
+use percent_encoding::{utf8_percent_encode, NON_ALPHANUMERIC};
+use std::collections::BTreeMap;
+use url::Url;
+
+/// Parses `input` as a URL and returns its components as an object value.
+///
+/// # Errors
+///
+/// Returns an error when `Url::parse` rejects `input`, which includes relative URLs.
+pub fn parse_url(input: &str) -> Resolved {
+    Url::parse(input)
+        .map_err(|e| format!("unable to parse url: {e}").into())
+        .map(url_to_dd_value)
+}
+
+/// Converts a parsed [`Url`] into the object value returned by [`parse_url`].
+///
+/// `scheme`, `host` and `path` are always inserted. `auth`, `port`, `hash` and
+/// `queryString` are only inserted when the URL has the corresponding component.
+fn url_to_dd_value(url: Url) -> Value {
+    let mut map = BTreeMap::<&str, Value>::new();
+
+    map.insert("scheme", url.scheme().into());
+    map.insert("host", url.host_str().map(ToOwned::to_owned).into());
+    map.insert("path", url.path().into());
+
+    // `auth` is keyed on the username: a password is only reported alongside one.
+    if !url.username().is_empty() {
+        let mut auth_map = ObjectMap::new();
+        auth_map.insert(
+            KeyString::from("username"),
+            url.username().to_owned().into(),
+        );
+
+        if let Some(password) = url.password() {
+            auth_map.insert(KeyString::from("password"), password.to_owned().into());
+        }
+
+        map.insert("auth", Value::Object(auth_map));
+    }
+
+    if let Some(port) = url.port() {
+        map.insert("port", port.into());
+    }
+
+    if let Some(hash) = url.fragment() {
+        map.insert("hash", hash.to_owned().into());
+    }
+
+    // Every query value is percent-encoded before it is stored.
+    let query_string: ObjectMap = url
+        .query_pairs()
+        .into_owned()
+        .map(|(k, v)| {
+            (
+                k.into(),
+                utf8_percent_encode(&v, NON_ALPHANUMERIC).to_string().into(),
+            )
+        })
+        .collect();
+
+    if !query_string.is_empty() {
+        map.insert("queryString", query_string.into());
+    }
+
+    map.into_iter().map(|(k, v)| (k.to_owned(), v)).collect()
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::btreemap;
+
+    #[test]
+    fn test_parses_simple_url() {
+        let result = parse_url("https://vector.dev/".into()).unwrap();
+        assert_eq!(
+            result,
+            Value::from(btreemap! {
+                "scheme" => "https",
+                "host" => "vector.dev",
+                "path" => "/"
+            })
+        );
+    }
+
+    #[test]
+    fn test_parses_url_with_query_strings() {
+        let result = parse_url(
+            "https://help.datadoghq.com/hc/en-us/search?utf8=%E2%9C%93&query=install&commit=Search"
+                .into(),
+        )
+        .unwrap();
+        assert_eq!(
+            result,
+            Value::from(btreemap! {
+                "scheme" => "https",
+                "host" => "help.datadoghq.com",
+                "path" => "/hc/en-us/search",
+                "queryString" => btreemap!{
+                    "utf8" => "%E2%9C%93",
+                    "query" => "install",
+                    "commit" => "Search"
+                },
+            })
+        );
+    }
+
+    #[test]
+    fn test_parses_complex_url() {
+        let result = parse_url("https://user:password@api.logmatic.io:8080/a/long/path/file.txt?debug&param1=foo&param2=bar#!/super/hash".into()).unwrap();
+        assert_eq!(
+            result,
+            Value::from(btreemap! {
+                "scheme" => "https",
+                "host" => "api.logmatic.io",
+                "port" => 8080,
+                "path" => "/a/long/path/file.txt",
+                "queryString" => btreemap! {
+                    "debug" => "",
+                    "param1" => "foo",
+                    "param2" => "bar"
+                },
+                "auth" => btreemap! {
+                    "username" => "user",
+                    "password" => "password"
+                },
+                "hash" => "!/super/hash"
+            })
+        );
+    }
+
+    // Url::parse only works on absolute URLs (at least scheme + host)
+    // Diff with the logs implementation, which is able to parse relative URLs
+    #[test]
+    fn test_parse_err_relative_url() {
+        let result = parse_url("/youpi1/youpi2/img.jpg?q=my%20query#configure/input".into());
+        assert!(result.is_err());
+    }
+
+    // Diff with the logs implementation, which returns an empty string for path
+    #[test]
+    fn test_parse_no_path() {
+        let result = parse_url("http://j.mp".into()).unwrap();
+        assert_eq!(
+            result,
+            Value::from(btreemap! {
+                "scheme" => "http",
+                "host" => "j.mp",
+                "path" => "/"
+            })
+        );
+    }
+}
diff --git a/src/datadog/grok/grok_filter.rs b/src/datadog/grok/grok_filter.rs
index 2b63daa5f5..a0178fe162 100644
--- a/src/datadog/grok/grok_filter.rs
+++ b/src/datadog/grok/grok_filter.rs
@@ -10,7 +10,7 @@ use percent_encoding::percent_decode;
 
 use super::{
     ast::{Function, FunctionArgument},
-    filters::{array, keyvalue, keyvalue::KeyValueFilter},
+    filters::{array, keyvalue, keyvalue::KeyValueFilter, url},
     matchers::date::{apply_date_filter, DateFilter},
     parse_grok::Error as GrokRuntimeError,
     parse_grok_rules::Error as GrokStaticError,
@@ -42,5 +42,6 @@ pub enum GrokFilter {
     ),
     KeyValue(KeyValueFilter),
+    Url,
 }
 
 impl fmt::Display for GrokFilter {
@@ -63,6 +64,7 @@ impl fmt::Display for GrokFilter {
             GrokFilter::Xml => f.pad("Xml"),
             GrokFilter::Array(..) => f.pad("Array(..)"),
             GrokFilter::KeyValue(..) => f.pad("KeyValue(..)"),
+            GrokFilter::Url => f.pad("Url"),
         }
     }
 }
@@ -113,5 +115,6 @@ impl TryFrom<&Function> for GrokFilter {
             "array" => array::filter_from_function(f),
             "keyvalue" => keyvalue::filter_from_function(f),
+            "url" => Ok(GrokFilter::Url),
             _ => Err(GrokStaticError::UnknownFilter(f.name.clone())),
         }
     }
@@ -270,3 +273,6 @@ pub fn apply_filter(value: &Value, filter: &GrokFilter) -> Result
+        GrokFilter::Url => parse_value_error_prone(value, filter, |b| {
+            url::parse_url(String::from_utf8_lossy(b).as_ref())
+        }),
     }
 }
 
diff --git a/src/datadog/grok/parse_grok.rs b/src/datadog/grok/parse_grok.rs
index 1b427e1f4a..3102b36e50 100644
--- a/src/datadog/grok/parse_grok.rs
+++ b/src/datadog/grok/parse_grok.rs
@@ -1209,6 +1209,30 @@ mod tests {
         )]);
     }
 
+    #[test]
+    fn supports_url_filter() {
+        test_grok_pattern(vec![(
+            "%{data:field:url}",
+            "https://user:password@api.logmatic.io:8080/a/long/path/file.txt?debug&param1=foo&param2=bar#!/super/hash",
+            Ok(Value::from(btreemap! {
+                "scheme" => "https",
+                "host" => "api.logmatic.io",
+                "port" => 8080,
+                "path" => "/a/long/path/file.txt",
+                "queryString" => btreemap! {
+                    "debug" => "",
+                    "param1" => "foo",
+                    "param2" => "bar"
+                },
+                "auth" => btreemap! {
+                    "username" => "user",
+                    "password" => "password"
+                },
+                "hash" => "!/super/hash"
+            })),
+        )]);
+    }
+
     #[test]
     fn parses_sample() {
         test_full_grok(vec![(