diff --git a/.envrc b/.envrc new file mode 100644 index 0000000..3b588d7 --- /dev/null +++ b/.envrc @@ -0,0 +1,3 @@ +if has guix; then + use guix erlang elixir elixir-hex just +fi diff --git a/.tool-versions b/.tool-versions deleted file mode 100644 index 52d11a3..0000000 --- a/.tool-versions +++ /dev/null @@ -1 +0,0 @@ -elixir 1.10 diff --git a/CHANGELOG.md b/CHANGELOG.md index bc6bfeb..e7e11c2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,9 @@ # Changelog +## v.0.6.0 + +Forked from Furlex with many changes/additions - changelog TODO + ## v.0.5.0 - Updates Floki ([queer](https://github.com/queer)) @@ -41,12 +45,12 @@ ## v.0.2.2 -- Furlex now supports passing HTTP options to Furlex.unfurl/2. +- Unfurl now supports passing HTTP options to Unfurl.unfurl/2. - `:depth` config has been transformed to a `:group_keys?` boolean. ## v.0.2.1 -- Add status code to %Furlex{} structure (thanks [abitdodgy](https://github.com/abitdodgy)) +- Add status code to Unfurl's return (thanks [abitdodgy](https://github.com/abitdodgy)) - Fix compatibility with Phoenix 1.3 (thanks, again, [abitdodgy](https://github.com/abitdodgy)!) ## v.0.2.0 diff --git a/LICENSE.md b/LICENSE.md index 77061d7..d63d841 100644 --- a/LICENSE.md +++ b/LICENSE.md @@ -1,3 +1,4 @@ +Copyright 2020 Bonfire Networks Copyright 2017 Clayton Gentry Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/README.md b/README.md index c9b47a5..bf4acd8 100644 --- a/README.md +++ b/README.md @@ -1,127 +1,163 @@ -# Furlex +# Unfurl -Furlex is a [structured data](https://moz.com/learn/seo/schema-structured-data) extraction tool written in Elixir. +Unfurl is a [structured data](https://moz.com/learn/seo/schema-structured-data) extraction tool written in Elixir. -It currently supports unfurling oEmbed, Twitter Card, Facebook Open Graph, JSON-LD -and plain ole' HTML `` data out of any url you supply. 
+It currently supports unfurling oEmbed, Open Graph (Facebook), Twitter Card, JSON-LD, rel-me, favicons, and plain ole' HTML `` data out of any url you supply. ## Installation -Add `:furlex` to your list of dependencies in `mix.exs`: +Add `:unfurl` to your list of dependencies in `mix.exs`:O ```elixir def deps do - [{:furlex, "~> 0.5.0"}] + [{:unfurl, "~> 0.6.0"}] end ``` -Then run `$ mix deps.get`. Also add `:furlex` to your applications list: +Then run `$ mix deps.get`. Also add `:unfurl` to your applications list: ```elixir def application do - [applications: [:furlex]] + [applications: [:unfurl]] end ``` -[Jason](http://github.com/michalmuskala/jason) is the default json library in Furlex. You can however configure Furlex to use another library. For example: +[Jason](http://github.com/michalmuskala/jason) is the default json library in Unfurl. You can however configure Unfurl to use another library. For example: ```elixir -config :furlex, :json_library, YourLibraryOfChoice +config :unfurl, :json_library, YourLibraryOfChoice ``` ## Usage -To unfurl a url, simply pass it to `Furlex.unfurl/1` +To unfurl a url, simply pass it to `Unfurl.unfurl/1` ```elixir -iex(1)> Furlex.unfurl "https://www.youtube.com/watch?v=Gh6H7Md_L2k" +iex(1)> Unfurl.unfurl "https://www.youtube.com/watch?v=Gh6H7Md_L2k" {:ok, - %Furlex{canonical_url: "https://www.youtube.com/watch?v=Gh6H7Md_L2k", - facebook: %{"fb:app_id" => "87741124305", - "og:description" => "Watch the full episode: https://www.thisoldhouse.com/watch/ask-toh-future-house-offerman Ask This Old House host Kevin O’Connor visits Nick Offerman in Los A...", - "og:image" => "https://i.ytimg.com/vi/Gh6H7Md_L2k/maxresdefault.jpg", - "og:site_name" => "YouTube", - "og:title" => "Touring Nick Offerman’s Wood Shop", "og:type" => "video", - "og:url" => "https://www.youtube.com/watch?v=Gh6H7Md_L2k", - "og:video:height" => ["720", "720"], - "og:video:secure_url" => ["https://www.youtube.com/embed/Gh6H7Md_L2k", - 
"https://www.youtube.com/v/Gh6H7Md_L2k?version=3&autohide=1"], - "og:video:type" => ["text/html", "application/x-shockwave-flash"], - "og:video:url" => ["https://www.youtube.com/embed/Gh6H7Md_L2k", - "http://www.youtube.com/v/Gh6H7Md_L2k?version=3&autohide=1"], - "og:video:width" => ["1280", "1280"]}, - json_ld: [%{"@context" => "http://schema.org", "@type" => "BreadcrumbList", - "itemListElement" => [%{"@type" => "ListItem", - "item" => %{"@id" => "http://www.youtube.com/user/thisoldhouse", - "name" => "This Old House"}, "position" => 1}]}], - oembed: %{"author_name" => "This Old House", - "author_url" => "https://www.youtube.com/user/thisoldhouse", - "height" => 270, - "html" => "", - "provider_name" => "YouTube", "provider_url" => "https://www.youtube.com/", - "thumbnail_height" => 360, - "thumbnail_url" => "https://i.ytimg.com/vi/Gh6H7Md_L2k/hqdefault.jpg", - "thumbnail_width" => 480, "title" => "Touring Nick Offerman’s Wood Shop", - "type" => "video", "version" => "1.0", "width" => 480}, - other: %{"description" => "Watch the full episode: https://www.thisoldhouse.com/watch/ask-toh-future-house-offerman Ask This Old House host Kevin O’Connor visits Nick Offerman in Los A...", - "keywords" => "this old house, how-to, home improvement, Episode, TV Show, DIY, Ask This Old House, Nick Offerman, Kevin O'Connor, woodworking, wood shop", - "theme-color" => "#ff0000", - "title" => "Touring Nick Offerman’s Wood Shop"}, - status_code: 200, - twitter: %{"twitter:app:id:googleplay" => "com.google.android.youtube", - "twitter:app:id:ipad" => "544007664", - "twitter:app:id:iphone" => "544007664", - "twitter:app:name:googleplay" => "YouTube", - "twitter:app:name:ipad" => "YouTube", - "twitter:app:name:iphone" => "YouTube", - "twitter:app:url:googleplay" => "https://www.youtube.com/watch?v=Gh6H7Md_L2k", - "twitter:app:url:ipad" => "vnd.youtube://www.youtube.com/watch?v=Gh6H7Md_L2k&feature=applinks", - "twitter:app:url:iphone" => 
"vnd.youtube://www.youtube.com/watch?v=Gh6H7Md_L2k&feature=applinks", - "twitter:card" => "player", - "twitter:description" => "Watch the full episode: https://www.thisoldhouse.com/watch/ask-toh-future-house-offerman Ask This Old House host Kevin O’Connor visits Nick Offerman in Los A...", - "twitter:image" => "https://i.ytimg.com/vi/Gh6H7Md_L2k/maxresdefault.jpg", - "twitter:player" => "https://www.youtube.com/embed/Gh6H7Md_L2k", - "twitter:player:height" => "720", "twitter:player:width" => "1280", - "twitter:site" => "@youtube", - "twitter:title" => "Touring Nick Offerman’s Wood Shop", - "twitter:url" => "https://www.youtube.com/watch?v=Gh6H7Md_L2k"}}} + %{ + other: %{ + "description" => "Ask This Old House host Kevin O’Connor visits Nick Offerman in Los Angeles to tour the comedian’s woodworking shop.SUBSCRIBE to This Old House: http://bit.ly...", + "keywords" => "this old house, how-to, home improvement, Episode, TV Show, DIY, Ask This Old House, Nick Offerman, Kevin O'Connor, woodworking, wood shop, Los Angeles, Comedian, This Old House, Home Improvement, DIY Ideas, Renovation, Renovation Ideas, How To Fix, How To Install, How To Build, Kevin o’connor, kevin o'connor house, kevin o'connor this old house, kevin o'connor ask this old house, kevin o'connor interview", + "theme-color" => "rgba(255, 255, 255, 0.98)", + "title" => ["Touring Nick Offerman’s Wood Shop | Ask This Old House", + "Touring Nick Offerman’s Wood Shop | Ask This Old House - YouTube"] + }, + canonical_url: nil, + facebook: %{ + "description" => "Ask This Old House host Kevin O’Connor visits Nick Offerman in Los Angeles to tour the comedian’s woodworking shop.SUBSCRIBE to This Old House: http://bit.ly...", + "fb" => %{"app_id" => "87741124305"}, + "image" => %{"height" => "720", "width" => "1280"}, + "site_name" => "YouTube", + "title" => "Touring Nick Offerman’s Wood Shop | Ask This Old House", + "type" => "video.other", + "url" => "https://www.youtube.com/watch?v=Gh6H7Md_L2k", + "video" => %{ 
+ "height" => "720", + "secure_url" => "https://www.youtube.com/embed/Gh6H7Md_L2k", + "type" => "text/html", + "url" => "https://www.youtube.com/embed/Gh6H7Md_L2k", + "width" => "1280" + } + }, + twitter: %{ + "app" => %{ + "id" => %{ + "googleplay" => "com.google.android.youtube", + "ipad" => "544007664", + "iphone" => "544007664" + }, + "name" => %{ + "googleplay" => "YouTube", + "ipad" => "YouTube", + "iphone" => "YouTube" + }, + "url" => %{ + "googleplay" => "https://www.youtube.com/watch?v=Gh6H7Md_L2k", + "ipad" => "vnd.youtube://www.youtube.com/watch?v=Gh6H7Md_L2k&feature=applinks", + "iphone" => "vnd.youtube://www.youtube.com/watch?v=Gh6H7Md_L2k&feature=applinks" + } + }, + "card" => "player", + "description" => "Ask This Old House host Kevin O’Connor visits Nick Offerman in Los Angeles to tour the comedian’s woodworking shop.SUBSCRIBE to This Old House: http://bit.ly...", + "image" => "https://i.ytimg.com/vi/Gh6H7Md_L2k/maxresdefault.jpg", + "player" => %{"height" => "720", "width" => "1280"}, + "site" => "@youtube", + "title" => "Touring Nick Offerman’s Wood Shop | Ask This Old House", + "url" => "https://www.youtube.com/watch?v=Gh6H7Md_L2k" + }, + oembed: %{ + "author_name" => "This Old House", + "author_url" => "https://www.youtube.com/@thisoldhouse", + "height" => 113, + "html" => "", + "provider_name" => "YouTube", + "provider_url" => "https://www.youtube.com/", + "thumbnail_height" => 360, + "thumbnail_url" => "https://i.ytimg.com/vi/Gh6H7Md_L2k/hqdefault.jpg", + "thumbnail_width" => 480, + "title" => "Touring Nick Offerman’s Wood Shop | Ask This Old House", + "type" => "video", + "version" => "1.0", + "width" => 200 + }, + json_ld: [ + %{ + "@context" => "http://schema.org", + "@type" => "BreadcrumbList", + "itemListElement" => [ + %{ + "@type" => "ListItem", + "item" => %{ + "@id" => "http://www.youtube.com/@thisoldhouse", + "name" => "This Old House" + }, + "position" => 1 + } + ] + } + ], + status_code: 200, + rel_me: nil, + favicon: 
"https://www.youtube.com/s/desktop/ef8ce500/img/favicon_32x32.png" + }} ``` ## Configuration -Furlex accepts a few optional configuration parameters. +Unfurl accepts a few optional configuration parameters. You may configure additional tags to capture under the Facebook OpenGraph and TwitterCard parsers. ```elixir -config :furlex, Furlex.Parser.Facebook, +config :unfurl, Unfurl.Parser.Facebook, tags: ~w(my:custom:facebook:tag another:custom:facebook:tag) -config :furlex, Furlex.Parser.Twitter, +config :unfurl, Unfurl.Parser.Twitter, tags: ~w(my:custom:twitter:tag) ``` -You may also configure the depth of the resulting Furlex map with a `:group_keys?` boolean. +You may also configure the depth of the resulting Unfurl map with a `:group_keys?` boolean. ```elixir -config :furlex, group_keys?: true +config :unfurl, group_keys?: true ``` -If this option is set to false or unconfigured, Furlex will return values mapped directly beneath OpenGraph and TwitterCard keys, i.e. +If this option is set to false or unconfigured, Unfurl will return values mapped directly beneath OpenGraph and TwitterCard keys, i.e. ```elixir -%Furlex{twitter: %{ +%{twitter: %{ "twitter:app:id:googleplay" => "com.google.android.youtube", "twitter:app:id:ipad" => "544007664", "twitter:app:id:iphone" => "544007664" }} ``` -If true, Furlex will return values grouped into colon-delimited map structures, i.e. +If true, Unfurl will return values grouped into colon-delimited map structures, i.e. ```elixir -%Furlex{twitter: %{ +%{twitter: %{ "twitter" => %{ "app" => %{ "id" => %{ @@ -136,7 +172,8 @@ If true, Furlex will return values grouped into colon-delimited map structures, ## License -Copyright 2017 Clayton Gentry +Copyright 2020 Bonfire Networks +Copyright 2017 Clayton Gentry (author of https://www.hex.pm/packages/furlex which Unfurl was forked from) Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
diff --git a/benchmark.exs b/benchmark.exs index 34c842c..4b340ad 100644 --- a/benchmark.exs +++ b/benchmark.exs @@ -4,8 +4,8 @@ vimeo = File.read! "./test/fixtures/vimeo.html" Benchee.run(%{ - "facebook" => fn -> Furlex.Parser.Facebook.parse(vimeo) end, - "twitter" => fn -> Furlex.Parser.Twitter.parse(vimeo) end, - "json_ld" => fn -> Furlex.Parser.JsonLD.parse(vimeo) end, - "html" => fn -> Furlex.Parser.HTML.parse(vimeo) end + "facebook" => fn -> Unfurl.Parser.Facebook.parse(vimeo) end, + "twitter" => fn -> Unfurl.Parser.Twitter.parse(vimeo) end, + "json_ld" => fn -> Unfurl.Parser.JsonLD.parse(vimeo) end, + "html" => fn -> Unfurl.Parser.HTML.parse(vimeo) end }) diff --git a/config/config.exs b/config/config.exs index 76f125b..f15ccaa 100644 --- a/config/config.exs +++ b/config/config.exs @@ -1,6 +1,13 @@ # This file is responsible for configuring your application # and its dependencies with the aid of the Mix.Config module. -use Mix.Config +import Config + +config :tesla, adapter: {Tesla.Adapter.Hackney, [recv_timeout: 1_000]} + +config :bypass, enable_debug_log: true + +config :logger, + truncate: :infinity # This configuration is loaded before any dependency and is restricted # to this project. 
If another project depends on this project, this @@ -10,11 +17,11 @@ use Mix.Config # You can configure for your application as: # -# config :furlex, key: :value +# config :unfurl, key: :value # # And access this configuration in your application as: # -# Application.get_env(:furlex, :key) +# Application.get_env(:unfurl, :key) # # Or configure a 3rd-party app: # diff --git a/lib/fetcher/fetcher.ex b/lib/fetcher/fetcher.ex new file mode 100644 index 0000000..e4ccdcc --- /dev/null +++ b/lib/fetcher/fetcher.ex @@ -0,0 +1,55 @@ +defmodule Unfurl.Fetcher do + @moduledoc """ + A module for fetching body data for a given url + """ + use Tesla + + plug Tesla.Middleware.Telemetry + + plug Unfurl.Tesla.Middleware.MaybeFollowRedirects, max_redirects: 5 + + import Untangle + + @doc """ + Fetches a url and extracts the body + """ + @spec fetch(String.t(), List.t()) :: {:ok, String.t(), Integer.t()} | {:error, Atom.t()} + def fetch(url, opts \\ []) + + def fetch(url, opts) when is_binary(url) do + URI.parse(url) + |> fetch(opts) + end + + def fetch(%URI{} = url, opts) do + case url do + %URI{host: nil, path: nil} -> + error(url, "Tried to fetch an invalid URL") + + %URI{scheme: "doi"} -> + error(url, "Tried to fetch an invalid URL") + + %URI{scheme: nil, host: nil, path: host_detected_as_path} -> + do_fetch("http://#{host_detected_as_path}", opts) + + %URI{} -> + do_fetch(to_string(url), opts) + end + end + + defp do_fetch(url, opts \\ []) when is_binary(url) do + case get(url, opts) do + {:ok, %{body: body, status: status_code}} -> {:ok, body, status_code} + other -> other + end + rescue + e in Tesla.Mock.Error -> + error(e) + + e in ArgumentError -> + error(e) + + e in CaseClauseError -> + error(e) + end +end diff --git a/lib/fetcher/maybe_follow_redirects.ex b/lib/fetcher/maybe_follow_redirects.ex new file mode 100644 index 0000000..ce1e1db --- /dev/null +++ b/lib/fetcher/maybe_follow_redirects.ex @@ -0,0 +1,115 @@ +defmodule Unfurl.Tesla.Middleware.MaybeFollowRedirects do + 
@moduledoc """ + Follow HTTP 3xx redirects, based on `Tesla.Middleware.FollowRedirects` with some changes + + ## Examples + + ```elixir + defmodule MyClient do + def client do + # defaults to 5 + Tesla.client([ + {Tesla.Middleware.FollowRedirects, max_redirects: 3} + ]) + end + end + ``` + + ## Options + + - `:max_redirects` - limit number of redirects (default: `5`) + """ + + @behaviour Tesla.Middleware + + @max_redirects 5 + @redirect_statuses [301, 302, 303, 307, 308] + + import Untangle + + @impl Tesla.Middleware + def call(env, next, opts \\ []) do + max = Keyword.get(opts || [], :max_redirects, @max_redirects) + + redirect(env, next, max) + end + + defp redirect(env, next, left) when left == 0 do + case Tesla.run(env, next) do + {:ok, %{status: status} = res} when status not in @redirect_statuses -> + {:ok, res} + + {:ok, res} -> + warn("Too many redirects, return the last response") + {:ok, res} + + error -> + error + end + end + + defp redirect(env, next, left) do + case Tesla.run(env, next) do + {:ok, %{status: status} = res} when status in @redirect_statuses -> + case Tesla.get_header(res, "location") do + nil -> + {:ok, res} + + location -> + if location in Application.get_env(:unfurl, :ignore_redirect_urls, []) do + warn( + res, + "Ignoring redirect to location in :ignore_redirect_urls config and returning previous response" + ) + + {:ok, env} + else + prev_uri = URI.parse(env.url) + next_uri = parse_location(location, res) + + # Copy opts and query params from the response env, + # these are not modified in the adapters, but middlewares + # that come after might store state there + env = %{env | opts: res.opts} + + env + |> filter_headers(prev_uri, next_uri) + |> new_request(status, URI.to_string(next_uri)) + |> redirect(next, left - 1) + end + end + + other -> + other + end + end + + # The 303 (See Other) redirect was added in HTTP/1.1 to indicate that the originally + # requested resource is not available, however a related resource (or another 
redirect) + # available via GET is available at the specified location. + # https://tools.ietf.org/html/rfc7231#section-6.4.4 + defp new_request(env, 303, location), do: %{env | url: location, method: :get, query: []} + + # The 307 (Temporary Redirect) status code indicates that the target + # resource resides temporarily under a different URI and the user agent + # MUST NOT change the request method (...) + # https://tools.ietf.org/html/rfc7231#section-6.4.7 + defp new_request(env, 307, location), do: %{env | url: location} + + defp new_request(env, _, location), do: %{env | url: location, query: []} + + defp parse_location("https://" <> _rest = location, _env), do: URI.parse(location) + defp parse_location("http://" <> _rest = location, _env), do: URI.parse(location) + defp parse_location(location, env), do: env.url |> URI.parse() |> URI.merge(location) + + # See https://github.com/teamon/tesla/issues/362 + # See https://github.com/teamon/tesla/issues/360 + @filter_headers ["authorization", "host"] + defp filter_headers(env, prev, next) do + if next.host != prev.host || next.port != prev.port || next.scheme != prev.scheme do + %{env | headers: Enum.filter(env.headers, fn {k, _} -> k not in @filter_headers end)} + else + env + end + end +end diff --git a/lib/furlex.ex b/lib/furlex.ex deleted file mode 100644 index ac45099..0000000 --- a/lib/furlex.ex +++ /dev/null @@ -1,104 +0,0 @@ -defmodule Furlex do - @moduledoc """ - Furlex is a structured data extraction tool written in Elixir. - - It currently supports unfurling oEmbed, Twitter Card, Facebook Open Graph, - JSON-LD and plain ole' HTML `` data out of any url you supply. 
- """ - - use Application - - alias Furlex.{Fetcher, Parser} - alias Furlex.Parser.{Facebook, HTML, JsonLD, Twitter} - - defstruct [ - :canonical_url, - :oembed, - :facebook, - :twitter, - :json_ld, - :other, - :status_code - ] - - @type t :: %__MODULE__{ - canonical_url: String.t(), - oembed: nil | Map.t(), - facebook: Map.t(), - twitter: Map.t(), - json_ld: List.t(), - other: Map.t(), - status_code: Integer.t() - } - - @doc false - def start(_type, _args) do - opts = [strategy: :one_for_one, name: Furlex.Supervisor] - - children = [ - Furlex.Oembed - ] - - Supervisor.start_link(children, opts) - end - - @doc """ - Unfurls a url - - unfurl/1 fetches oembed data if applicable to the given url's host, - in addition to Twitter Card, Open Graph, JSON-LD and other HTML meta tags. - - unfurl/2 also accepts a keyword list that will be passed to HTTPoison. - """ - @spec unfurl(String.t(), Keyword.t()) :: {:ok, __MODULE__.t()} | {:error, Atom.t()} - def unfurl(url, opts \\ []) do - with {:ok, {body, status_code}, oembed} <- fetch(url, opts), - {:ok, results} <- parse(body) do - {:ok, - %__MODULE__{ - canonical_url: Parser.extract_canonical(body), - oembed: oembed, - facebook: results.facebook, - twitter: results.twitter, - json_ld: results.json_ld, - other: results.other, - status_code: status_code - }} - end - end - - defp fetch(url, opts) do - fetch = Task.async(Fetcher, :fetch, [url, opts]) - fetch_oembed = Task.async(Fetcher, :fetch_oembed, [url, opts]) - yield = Task.yield_many([fetch, fetch_oembed]) - - with [fetch, fetch_oembed] <- yield, - {_fetch, {:ok, {:ok, body, status_code}}} <- fetch, - {_fetch_oembed, {:ok, {:ok, oembed}}} <- fetch_oembed do - {:ok, {body, status_code}, oembed} - else - _ -> {:error, :fetch_error} - end - end - - defp parse(body) do - parse = &Task.async(&1, :parse, [body]) - tasks = Enum.map([Facebook, Twitter, JsonLD, HTML], parse) - - with [facebook, twitter, json_ld, other] <- Task.yield_many(tasks), - {_facebook, {:ok, {:ok, facebook}}} 
<- facebook, - {_twitter, {:ok, {:ok, twitter}}} <- twitter, - {_json_ld, {:ok, {:ok, json_ld}}} <- json_ld, - {_other, {:ok, {:ok, other}}} <- other do - {:ok, - %{ - facebook: facebook, - twitter: twitter, - json_ld: json_ld, - other: other - }} - else - _ -> {:error, :parse_error} - end - end -end diff --git a/lib/furlex/fetcher.ex b/lib/furlex/fetcher.ex deleted file mode 100644 index 58e90b6..0000000 --- a/lib/furlex/fetcher.ex +++ /dev/null @@ -1,45 +0,0 @@ -defmodule Furlex.Fetcher do - @moduledoc """ - A module for fetching body data for a given url - """ - - require Logger - - alias Furlex.Oembed - - @json_library Application.get_env(:furlex, :json_library, Jason) - - @doc """ - Fetches a url and extracts the body - """ - @spec fetch(String.t(), List.t()) :: {:ok, String.t(), Integer.t()} | {:error, Atom.t()} - def fetch(url, opts \\ []) do - case HTTPoison.get(url, [], opts) do - {:ok, %{body: body, status_code: status_code}} -> {:ok, body, status_code} - other -> other - end - end - - @doc """ - Fetches oembed data for the given url - """ - @spec fetch_oembed(String.t(), List.t()) :: {:ok, String.t()} | {:ok, nil} | {:error, Atom.t()} - def fetch_oembed(url, opts \\ []) do - with {:ok, endpoint} <- Oembed.endpoint_from_url(url), - params = %{"url" => url}, - opts = Keyword.put(opts, :params, params), - {:ok, response} <- HTTPoison.get(endpoint, [], opts), - {:ok, body} <- @json_library.decode(response.body) do - {:ok, body} - else - {:error, :no_oembed_provider} -> - {:ok, nil} - - other -> - "Could not fetch oembed for #{inspect(url)}: #{inspect(other)}" - |> Logger.error() - - {:ok, nil} - end - end -end diff --git a/lib/furlex/oembed.ex b/lib/furlex/oembed.ex deleted file mode 100644 index a4f77c0..0000000 --- a/lib/furlex/oembed.ex +++ /dev/null @@ -1,131 +0,0 @@ -defmodule Furlex.Oembed do - @moduledoc """ - A module for managing oembed data - """ - - use GenServer - use HTTPoison.Base - - require Logger - - @json_library 
Application.get_env(:furlex, :json_library, Jason) - - @doc """ - Fetches the list of Oembed providers - - Soft fetch will fetch cached providers. Hard fetch requests - providers from oembed.com and purges the cache. - """ - @spec fetch_providers(Atom.t()) :: {:ok, List.t()} | {:error, Atom.t()} - def fetch_providers(type \\ :soft) - - def fetch_providers(:hard) do - case get("/providers.json") do - {:ok, %{body: providers}} -> - GenServer.cast(__MODULE__, {:providers, providers}) - {:ok, providers} - - other -> - Logger.error("Could not fetch providers: #{inspect(other)}") - {:error, :fetch_error} - end - end - - def fetch_providers(_soft) do - case GenServer.call(__MODULE__, :providers) do - nil -> fetch_providers(:hard) - providers -> {:ok, providers} - end - end - - @doc """ - Returns an Oembed endpoint for the given url - - ## Examples - - iex> Oembed.endpoint_from_url "https://vimeo.com/88856141" - {:ok, "https://vimeo.com/api/oembed.json"} - - iex> Oembed.endpoint_from_url "https://vimeo.com/88856141", %{"format" => "xml"} - {:ok, "https://vimeo.com/api/oembed.xml"} - """ - @spec endpoint_from_url(String.t(), Map.t()) :: {:ok, String.t()} | {:error, Atom.t()} - def endpoint_from_url(url, params \\ %{"format" => "json"}, opts \\ []) do - case provider_from_url(url, opts) do - nil -> - {:error, :no_oembed_provider} - - provider -> - endpoint_from_provider(provider, params) - end - end - - # Maps a url to a provider, or returns nil if no such provider exists - defp provider_from_url(url, opts) do - fetch_type = if Keyword.get(opts, :skip_cache?, false), do: :hard, else: :soft - - {:ok, providers} = fetch_providers(fetch_type) - - case URI.parse(url) do - %URI{host: nil} -> - nil - - %URI{host: host} -> - Enum.find(providers, &host_matches?(host, &1)) - end - end - - defp endpoint_from_provider(provider, params) do - [endpoint | _] = provider["endpoints"] - - url = endpoint["url"] - regex = ~r/{(.*?)}/ - url = Regex.replace(regex, url, fn _, key -> params[key] 
end) - - {:ok, url} - end - - defp host_matches?(host, %{"provider_url" => provider_url}) do - Regex.match?(~r/https?:\/\/#{host}/, provider_url) - end - - ## GenServer callbacks - - @doc false - def start_link(_) do - GenServer.start_link(__MODULE__, nil, name: __MODULE__) - end - - def init(state) do - {:ok, state} - end - - def handle_call(:providers, _from, state) do - {:reply, state, state} - end - - def handle_cast({:providers, providers}, _) do - {:noreply, providers} - end - - def process_url(path) do - oembed_host() <> path - end - - def process_response_body(body) do - case @json_library.decode(body) do - {:ok, body} -> body - _error -> body - end - end - - defp config(key) do - :furlex - |> Application.get_env(__MODULE__, []) - |> Keyword.get(key) - end - - defp oembed_host do - config(:oembed_host) || "https://oembed.com" - end -end diff --git a/lib/furlex/parser/json_ld.ex b/lib/furlex/parser/json_ld.ex deleted file mode 100644 index 2c8ef63..0000000 --- a/lib/furlex/parser/json_ld.ex +++ /dev/null @@ -1,33 +0,0 @@ -defmodule Furlex.Parser.JsonLD do - @behaviour Furlex.Parser - - @json_library Application.get_env(:furlex, :json_library, Jason) - - @spec parse(String.t()) :: nil | {:ok, List.t()} - def parse(html) do - meta = "script[type=\"application/ld+json\"]" - - html - |> Floki.parse_document() - |> elem(1) - |> Floki.find(meta) - |> case do - nil -> - {:ok, []} - - elements -> - json_ld = - elements - |> Enum.map(&decode/1) - |> List.flatten() - - {:ok, json_ld} - end - end - - defp decode(element) do - element - |> Floki.text(js: true) - |> @json_library.decode!() - end -end diff --git a/lib/oembed.ex b/lib/oembed.ex new file mode 100644 index 0000000..5583d83 --- /dev/null +++ b/lib/oembed.ex @@ -0,0 +1,315 @@ +defmodule Unfurl.Oembed do + @moduledoc """ + A module for managing oembed data + """ + + use GenServer + + alias Unfurl.Fetcher + + import Untangle + use Arrows + + @json_library Application.compile_env(:unfurl, :json_library, Jason) + 
+ @doc """ + Fetches oembed data for the given url *if* it comes from a known provider + """ + @spec fetch(String.t(), List.t()) :: {:ok, String.t()} | {:ok, nil} | {:error, Atom.t()} + def fetch(url, _opts \\ []) do + detect_endpoint = endpoint_from_url(url) + + with {:ok, endpoint} <- detect_endpoint, + {:ok, data} <- do_fetch_from_endpoint(endpoint, url) do + {:ok, data} + else + {:error, :no_oembed_provider} -> + {:ok, nil} + + other -> + error( + other, + "Could not fetch oembed for: #{inspect(url)} - from endpoint: #{inspect(detect_endpoint)} - with error" + ) + + {:ok, nil} + end + end + + @doc """ + Looks for an oembed link in the HTML of the given url and fetches it + """ + def detect_and_fetch(url, html, _opts \\ []) do + with {:ok, endpoint} <- endpoint_from_html(html), + {:ok, data} <- do_fetch_from_endpoint(endpoint, url) do + data + else + {:error, :no_oembed_provider} -> + nil + + other -> + error(other, "Could not find an oembed for #{inspect(url)}") + + nil + end + end + + defp do_fetch_from_endpoint({mod, fun}, url) when is_atom(mod) and is_atom(fun) do + apply(mod, fun, [url]) + end + + defp do_fetch_from_endpoint(fun, url) when is_function(fun) do + fun.(url) + end + + defp do_fetch_from_endpoint(endpoint, _url) do + with {:ok, body, 200} <- Fetcher.fetch(endpoint), + {:ok, data} <- @json_library.decode(body) do + {:ok, %{oembed: data}} + end + end + + @doc """ + Fetches the list of Oembed providers + + Soft fetch will fetch cached providers. Hard fetch requests + providers from oembed.com and purges the cache. 
+ """ + @spec fetch_providers(Atom.t()) :: {:ok, List.t()} | {:error, Atom.t()} + def fetch_providers(type \\ :soft) + + def fetch_providers(:hard) do + extra_providers = config(:extra_providers) || [] + + case Fetcher.fetch("https://oembed.com/providers.json") do + {:ok, providers, 200} -> + with {:ok, providers} when is_list(providers) <- Jason.decode(providers) do + providers = prepare_providers_regexes(providers ++ extra_providers) + info(providers, "Caching oembed providers") + GenServer.cast(__MODULE__, {:providers, providers}) + {:ok, providers} + else + error -> + error(error, "Could not parse oembed providers") + # {:error, :providers_parse_error} + {:ok, prepare_providers_regexes(extra_providers)} + end + + other -> + error(other, "Could not fetch oembed providers") + # {:error, :providers_fetch_error} + {:ok, prepare_providers_regexes(extra_providers)} + end + end + + def fetch_providers(_soft) do + case GenServer.call(__MODULE__, :providers) do + nil -> fetch_providers(:hard) + providers -> {:ok, providers} + end + end + + @doc """ + Returns an Oembed endpoint for the given url + + ## Examples + + iex> Oembed.endpoint_from_url "https://vimeo.com/88856141" + {:ok, "https://vimeo.com/api/oembed.json"} + + iex> Oembed.endpoint_from_url "https://vimeo.com/88856141", %{"format" => "xml"} + {:ok, "https://vimeo.com/api/oembed.xml"} + """ + @spec endpoint_from_url(String.t(), Map.t()) :: {:ok, String.t()} | {:error, Atom.t()} + def endpoint_from_url(url, params \\ %{"format" => "json"}, opts \\ []) do + case provider_from_url(url, opts) do + nil -> + {:error, :no_oembed_provider} + + provider -> + {:ok, endpoint_from_provider(provider, url, params) |> debug()} + end + end + + def endpoint_from_html(html) do + case parse_html_for_oembed(html) do + {_, provider} -> + {:ok, provider} + + _ -> + {:error, :no_oembed_provider} + end + end + + defp parse_html_for_oembed(html) when is_binary(html) do + doc = + html + |> Floki.parse_document() + |> elem(1) + + 
Unfurl.Parser.extract("application/json+oembed", doc, &"link[type=\"#{&1}\"]", "href") || + Unfurl.Parser.extract("text/xml+oembed", doc, &"link[type=\"#{&1}\"]", "href") + end + + defp parse_html_for_oembed(_), do: false + + # Maps a url to a provider, or returns nil if no such provider exists + defp provider_from_url(url, opts) do + fetch_type = if Keyword.get(opts, :skip_cache?, false), do: :hard, else: :soft + + {:ok, providers} = fetch_providers(fetch_type) + + case URI.parse(url) do + %URI{scheme: nil, host: nil, path: host_detected_as_path} + when is_binary(host_detected_as_path) -> + Enum.find(providers, &host_matches?(host_detected_as_path, &1)) + |> debug() + + %URI{host: host} when is_binary(host) -> + Enum.find(providers, &host_matches?(host, &1)) + |> debug() + + _ -> + nil + end || + Enum.find(providers, &an_endpoint_matches?(url, &1)) + |> debug() + end + + defp endpoint_from_provider(%{"fetch_function" => fetch_function} = _provider, _url, _params) + when is_function(fetch_function) do + fetch_function + end + + defp endpoint_from_provider(%{"fetch_function" => {mod, fun}} = _provider, _url, _params) do + {mod, fun} + end + + defp endpoint_from_provider(%{"endpoints" => endpoints} = _provider, url, params) do + [endpoint | _] = endpoints + # TODO: support multiple endpoints? 
+ + endpoint_url = Regex.replace(~r/{(.*?)}/, endpoint["url"], fn _, key -> params[key] end) + + if endpoint["append_url"] do + "#{endpoint_url}#{url}" + else + URI.append_query(URI.parse(endpoint_url), URI.encode_query(%{"url" => url})) + end + end + + # defp host_matches?(host, %{"provider_url" => provider_url, "endpoints"=> endpoints}) when is_list(endpoints) do + # String.contains?(provider_url, host) or an_endpoint_matches?(host, endpoints) + # end + defp host_matches?(host, %{"provider_url" => provider_url}) do + String.contains?(provider_url, host) + end + + defp an_endpoint_matches?(url, %{"endpoints" => endpoints}) when is_list(endpoints) do + an_endpoint_matches?(url, endpoints) + end + + defp an_endpoint_matches?(url, endpoints) when is_list(endpoints) do + Enum.any?(endpoints, fn endpoint -> + endpoint + |> Map.get("schemes", []) + |> Enum.any?(fn + fun when is_function(fun, 1) -> + fun.(url) + |> debug(url) + + scheme -> + # with {:ok, regex} <- Regex.recompile(scheme) do + # Regex.match?(url, regex) + # |> debug("ran regex for provider") + # else + # e -> + # error(e, "Could not (re)compile regex for provider: #{scheme}") + String.match?(url, scheme) + # end + end) + end) + end + + defp an_endpoint_matches?(_url, _) do + nil + end + + defp prepare_providers_regexes(providers) when is_list(providers) or is_map(providers) do + Enum.map(providers, &prepare_provider_regexes/1) + end + + defp prepare_provider_regexes(%{"endpoints" => endpoints} = provider) + when is_list(endpoints) and endpoints != [] do + Enum.map(endpoints, &prepare_endpoint_regexes/1) + |> Map.put(provider, "endpoints", ...) + end + + defp prepare_provider_regexes(provider) do + provider + end + + defp prepare_endpoint_regexes(%{"schemes" => schemes} = endpoint) + when is_list(schemes) and schemes != [] do + Enum.map(schemes, &prepare_scheme_regex/1) + |> Map.put(endpoint, "schemes", ...) 
+  end
+
+  defp prepare_endpoint_regexes(endpoint) do
+    endpoint
+  end
+
+  defp prepare_scheme_regex(scheme) when is_binary(scheme) do
+    with {:ok, regex} <-
+           scheme
+           |> String.replace("*", "[^/]+")
+           |> String.replace(".", "\\.")
+           |> String.replace("http:", "^https?:")
+           |> String.replace("https:", "^https?:")
+           |> Regex.compile()
+           |> debug(scheme) do
+      regex
+    else
+      e ->
+        error(e)
+        scheme
+    end
+  end
+
+  defp prepare_scheme_regex(scheme) do
+    scheme
+  end
+
+  ## GenServer callbacks
+
+  @doc false
+  def start_link(_) do
+    GenServer.start_link(__MODULE__, nil, name: __MODULE__)
+  end
+
+  def init(state) do
+    {:ok, state}
+  end
+
+  def handle_call(:providers, _from, state) do
+    {:reply, state, state}
+  end
+
+  def handle_cast({:providers, providers}, _) do
+    {:noreply, providers}
+  end
+
+  def process_url(path) do
+    oembed_host() <> path
+  end
+
+  defp config(key) do
+    :unfurl
+    |> Application.get_env(__MODULE__, [])
+    |> Keyword.get(key)
+  end
+
+  defp oembed_host do
+    config(:oembed_host) || "https://oembed.com"
+  end
+end
diff --git a/lib/furlex/parser/facebook.ex b/lib/parser/facebook.ex
similarity index 73%
rename from lib/furlex/parser/facebook.ex
rename to lib/parser/facebook.ex
index c91e511..1ab3f13 100644
--- a/lib/furlex/parser/facebook.ex
+++ b/lib/parser/facebook.ex
@@ -1,7 +1,8 @@
-defmodule Furlex.Parser.Facebook do
-  @behaviour Furlex.Parser
+defmodule Unfurl.Parser.Facebook do
+  use Arrows
+  @behaviour Unfurl.Parser
 
-  alias Furlex.Parser
+  alias Unfurl.Parser
 
   @tags ~w(
     fb:app_id fb:pages
@@ -28,11 +29,17 @@ defmodule Furlex.Parser.Facebook do
   )
 
   @spec parse(String.t()) :: {:ok, Map.t()}
-  def parse(html) do
+  def parse(html, _opts \\ [])
+  def parse(html, _opts) when is_binary(html) do
+    html
+    |> Floki.parse_document()
+    ~> parse()
+  end
+  def parse(html, _opts) do
     meta = &"meta[property=\"#{&1}\"]"
     map = Parser.extract(tags(), html, meta)
 
-    {:ok, map}
+    {:ok, Map.merge(map, Map.get(map, "og", %{})) |> Map.drop(["og"])}
   end
 
   def tags do
@@ -41,5 
+48,5 @@ defmodule Furlex.Parser.Facebook do |> Enum.uniq() end - defp config(key), do: Application.get_env(:furlex, __MODULE__)[key] + defp config(key), do: Application.get_env(:unfurl, __MODULE__)[key] end diff --git a/lib/furlex/parser/html.ex b/lib/parser/html.ex similarity index 60% rename from lib/furlex/parser/html.ex rename to lib/parser/html.ex index ca25718..86f641b 100644 --- a/lib/furlex/parser/html.ex +++ b/lib/parser/html.ex @@ -1,30 +1,48 @@ -defmodule Furlex.Parser.HTML do - @behaviour Furlex.Parser +defmodule Unfurl.Parser.HTML do + use Arrows + @behaviour Unfurl.Parser - alias Furlex.Parser.{Facebook, Twitter} + alias Unfurl.Parser.{Facebook, Twitter} @spec parse(String.t()) :: nil | {:ok, Map.t()} - def parse(html) do + def parse(html, _opts \\ []) + def parse(html, _opts) when is_binary(html) do html |> Floki.parse_document() - |> elem(1) + ~> parse() + end + def parse(html, _opts) do + result = get_title(html) + + html |> Floki.find("meta[name]") |> case do nil -> - {:ok, %{}} + {:ok, result} elements -> - content = - elements - |> filter_other() - |> Enum.reduce(%{}, &to_map/2) + {:ok, + elements + |> filter_meta() + |> Enum.reduce(result, &to_map/2)} + end + end + + defp get_title(html) do + case Floki.find(html, "title") do + nil -> + %{} - {:ok, content} + title -> + case Floki.text(title, deep: false) do + "" -> %{} + title -> %{"title" => title} + end end end # Filter out plain meta elements from Twitter, Facebook, etc. 
- defp filter_other(elements) do + defp filter_meta(elements) do Enum.reject(elements, fn element -> extract_attribute(element, "name") in (Facebook.tags() ++ Twitter.tags()) end) @@ -56,7 +74,7 @@ defmodule Furlex.Parser.HTML do defp extract_attribute(element, key) do case Floki.attribute(element, key) do - [attribute] -> attribute + [_] = attributes -> Floki.text(attributes, deep: false) _ -> nil end end diff --git a/lib/parser/json_ld.ex b/lib/parser/json_ld.ex new file mode 100644 index 0000000..177a2e2 --- /dev/null +++ b/lib/parser/json_ld.ex @@ -0,0 +1,58 @@ +defmodule Unfurl.Parser.JsonLD do + use Arrows + import Untangle + @behaviour Unfurl.Parser + + @json_library Application.compile_env(:unfurl, :json_library, Jason) + + @spec parse(String.t()) :: nil | {:ok, List.t()} + def parse(html, _opts \\ []) + def parse(html, _opts) when is_binary(html) do + html + |> Floki.parse_document() + ~> parse() + end + def parse(html, _opts) do + meta = "script[type=\"application/ld+json\"]" + + html + # |> debug("HTML elements") + |> Floki.find(meta) + # |> debug("JSON-LD elements") + |> case do + nil -> + {:ok, []} + + [] -> + {:ok, []} + + elements -> + json_ld = + elements + |> Enum.flat_map(&decode/1) + |> List.flatten() + |> Enum.uniq() + + {:ok, json_ld} + end + end + + defp decode(element) do + element + |> Floki.text(js: true) + |> String.trim() + # |> debug("JSON-LD element") + |> safe_decode() + end + + defp safe_decode(""), do: [] + defp safe_decode(json) do + case @json_library.decode(json) do + {:ok, data} -> List.wrap(data) + {:error, e} -> + warn(e, "Failed to decode JSON-LD") + [] + end + end + +end diff --git a/lib/furlex/parser.ex b/lib/parser/parser.ex similarity index 51% rename from lib/furlex/parser.ex rename to lib/parser/parser.ex index 491801f..be497d6 100644 --- a/lib/furlex/parser.ex +++ b/lib/parser/parser.ex @@ -1,4 +1,6 @@ -defmodule Furlex.Parser do +defmodule Unfurl.Parser do + use Arrows + @doc """ Parses the given HTML, returning a map 
structure of structured data keys mapping to their respective values, or an error. @@ -10,26 +12,38 @@ defmodule Furlex.Parser do the given match function """ @spec extract(List.t() | String.t(), String.t(), Function.t()) :: Map.t() - def extract(tags, html, match) when is_list(tags) do + def extract(tag, html, match, extract_attr \\ "content") + + def extract(tags, html, match, extract_attr) when is_list(tags) do tags - |> Stream.map(&extract(&1, html, match)) - |> Enum.reject(fn {_, v} -> is_nil(v) end) + |> Stream.map(&extract(&1, html, match, extract_attr)) + |> Enum.reject(fn + {_, v} -> is_nil(v) + nil -> true + end) |> Map.new() - |> group_keys() + |> maybe_group_keys() end - def extract(tag, html, match) do + def extract(tag, html, match, extract_attr) when is_binary(html) do html |> Floki.parse_document() - |> elem(1) + ~> extract(tag, ..., match, extract_attr) + end + + def extract(tag, html, match, extract_attr) do + html |> Floki.find(match.(tag)) |> case do nil -> nil + [] -> + nil + elements -> content = - case do_extract_content(elements) do + case do_extract_content(elements, extract_attr) do [] -> nil [element] -> element content -> content @@ -41,10 +55,13 @@ defmodule Furlex.Parser do @doc "Extracts a canonical url from the given raw HTML" @spec extract_canonical(String.t()) :: nil | String.t() - def extract_canonical(html) do + def extract_canonical(html) when is_binary(html) do html |> Floki.parse_document() - |> elem(1) + ~> extract_canonical() + end + def extract_canonical(html) do + html |> Floki.find("link[rel=\"canonical\"]") |> case do [] -> @@ -62,35 +79,39 @@ defmodule Furlex.Parser do ## Examples - iex> Application.put_env(:furlex, :group_keys?, false) - iex> Furlex.Parser.group_keys %{"twitter:app:id" => 123, "twitter:app:name" => "YouTube"} - %{"twitter:app:id" => 123, "twitter:app:name" => "YouTube"} - - iex> Application.put_env(:furlex, :group_keys?, true) - iex> Furlex.Parser.group_keys %{"twitter:app:id" => 123, 
"twitter:app:name" => "YouTube"} - %{ - "twitter" => %{ - "app" => %{ - "id" => 123, - "name" => "YouTube" + iex> Application.put_env(:unfurl, :group_keys?, false) + iex> Unfurl.Parser.maybe_group_keys %{"twitter:app:id" => 123, "twitter:app:name" => "YouTube"} + %{"twitter:app:id" => 123, "twitter:app:name" => "YouTube"} + + iex> Application.put_env(:unfurl, :group_keys?, true) + iex> Unfurl.Parser.maybe_group_keys %{"twitter:app:id" => 123, "twitter:app:name" => "YouTube"} + %{ + "twitter" => %{ + "app" => %{ + "id" => 123, + "name" => "YouTube" + } } } - } """ - @spec group_keys(Map.t()) :: Map.t() - def group_keys(map) - - def group_keys(map) do - if Application.get_env(:furlex, :group_keys?) do - Enum.reduce(map, %{}, fn - {_, v}, _acc when is_map(v) -> group_keys(v) - {k, v}, acc -> do_group_keys(k, v, acc) - end) + @spec maybe_group_keys(Map.t()) :: Map.t() + def maybe_group_keys(map) + + def maybe_group_keys(map) do + if Application.get_env(:unfurl, :group_keys?, true) do + do_group_keys(map) else map end end + defp do_group_keys(map) do + Enum.reduce(map, %{}, fn + {_, v}, _acc when is_map(v) -> do_group_keys(v) + {k, v}, acc -> do_group_keys(k, v, acc) + end) + end + defp do_group_keys(key, value, acc) do [h | t] = key |> String.split(":") |> Enum.reverse() base = Map.new([{h, value}]) @@ -115,10 +136,11 @@ defmodule Furlex.Parser do right end - defp do_extract_content(elements) do - Enum.map(elements, fn element -> + defp do_extract_content(elements, extract_attr) do + elements + |> Enum.map(fn element -> element - |> Floki.attribute("content") + |> Floki.attribute(extract_attr) |> Enum.at(0) end) end diff --git a/lib/parser/rel_me.ex b/lib/parser/rel_me.ex new file mode 100644 index 0000000..95e6fc0 --- /dev/null +++ b/lib/parser/rel_me.ex @@ -0,0 +1,40 @@ +defmodule Unfurl.Parser.RelMe do + import Untangle + + def parse(html, opts \\ []) + + def parse(html_tree, opts) when is_list(html_tree) do + with rel_me_hrefs when is_list(rel_me_hrefs) and 
rel_me_hrefs != [] <-
+           Floki.attribute(html_tree, "link[rel~=me]", "href") ++
+             Floki.attribute(html_tree, "a[rel~=me]", "href") do
+      case opts[:rel_me_urls] do
+        rel_me_urls when is_list(rel_me_urls) ->
+          {:ok,
+           %{urls: rel_me_hrefs, verified: Enum.any?(rel_me_hrefs, fn x -> x in rel_me_urls end)}}
+
+        _ ->
+          # no url(s) provided to verify against
+          {:ok, %{urls: rel_me_hrefs}}
+      end
+    else
+      e ->
+        warn(e, "Parsing error with rel=me")
+        {:ok, nil}
+    end
+  end
+
+  def parse(html, opts) when is_binary(html) do
+    with {:ok, html_tree} <- Floki.parse_document(html) do
+      parse(html_tree, opts)
+    else
+      e ->
+        warn(e, "Parsing error with rel=me")
+        {:ok, nil}
+    end
+  end
+
+  def parse(html, _opts) do
+    warn(html, "Invalid HTML")
+    {:ok, nil}
+  end
+end
diff --git a/lib/furlex/parser/twitter.ex b/lib/parser/twitter.ex
similarity index 64%
rename from lib/furlex/parser/twitter.ex
rename to lib/parser/twitter.ex
index 91c57d2..10ef0c6 100644
--- a/lib/furlex/parser/twitter.ex
+++ b/lib/parser/twitter.ex
@@ -1,7 +1,8 @@
-defmodule Furlex.Parser.Twitter do
-  @behaviour Furlex.Parser
+defmodule Unfurl.Parser.Twitter do
+  use Arrows
+  @behaviour Unfurl.Parser
 
-  alias Furlex.Parser
+  alias Unfurl.Parser
 
   @tags ~w(
     twitter:card twitter:site twitter:domain twitter:url twitter:site:id
@@ -14,11 +15,17 @@
   )
 
   @spec parse(String.t()) :: {:ok, Map.t()}
-  def parse(html) do
+  def parse(html, _opts \\ [])
+  def parse(html, _opts) when is_binary(html) do
+    html
+    |> Floki.parse_document()
+    ~> parse()
+  end
+  def parse(html, _opts) do
     meta = &"meta[name=\"#{&1}\"]"
     map = Parser.extract(tags(), html, meta)
 
-    {:ok, map}
+    {:ok, Map.merge(map, Map.get(map, "twitter", %{})) |> Map.drop(["twitter"])}
   end
 
   @doc false
@@ -28,5 +35,5 @@
     |> Enum.uniq()
   end
 
-  defp config(key), do: Application.get_env(:furlex, __MODULE__)[key]
+  defp config(key), do: Application.get_env(:unfurl, __MODULE__)[key]
 end
diff --git a/lib/unfurl.ex 
b/lib/unfurl.ex new file mode 100644 index 0000000..0d85645 --- /dev/null +++ b/lib/unfurl.ex @@ -0,0 +1,280 @@ +defmodule Unfurl do + @moduledoc "./README.md" |> File.stream!() |> Enum.drop(1) |> Enum.join() + + use Application + import Untangle + + alias Unfurl.{Fetcher, Parser, Oembed} + alias Unfurl.Parser.{Facebook, HTML, JsonLD, Twitter, RelMe} + + @doc false + def start(_type, _args) do + opts = [strategy: :one_for_one, name: Unfurl.Supervisor] + + children = [ + Unfurl.Oembed + ] + + Supervisor.start_link(children, opts) + end + + @doc """ + Unfurls a url + + Fetches oembed data if available, as well as the source HTML to be parsed by `unfurl_html/3`. + + Also accepts opts as a keyword list. + """ + @spec unfurl(String.t(), Keyword.t()) :: {:ok, Map.t()} | {:error, Atom.t()} + def unfurl(url, opts \\ []) do + case fetch(url, opts) do + {:ok, {body, status_code}, oembed_meta} when is_binary(body) -> + unfurl_html( + url, + body, + Keyword.merge(opts, + # because already done in `fetch/2` + skip_oembed_fetch: true, + extra: Enum.into(oembed_meta || %{}, %{status_code: status_code}) + ) + ) + + {:ok, {data, status_code}, oembed_meta} when is_map(oembed_meta) and oembed_meta !=%{} -> + debug(status_code, "Could not fetch URL but got some metadata via oembed") + {:ok, if(is_map(data), do: Map.merge(oembed_meta, data), else: oembed_meta) |> Map.put(:status_code, status_code)} + + {:ok, {data, status_code}, _} when is_map(data) -> + {:ok, data} + + other -> + error(other, "Could not fetch any metadata") + end + end + + @doc """ + Extracts data from the pre-fetched HTML source of a URL + + Checks for Twitter Card, Open Graph, JSON-LD, rel-me, and other HTML meta tags. 
+ + Also tries to find and/or fetch (disable all with `skip_fetches: true`): + - a favicon (disable with `skip_favicon_fetch: true`) + - oembed info (disable with `skip_oembed_fetch: true`) + """ + def unfurl_html(url, body, opts \\ []) + def unfurl_html(url, body, opts) when is_binary(body) do + with {:ok, body} <- Floki.parse_document(body), + canonical_url <- Parser.extract_canonical(body), + {:ok, results} <- + parse( + body, + # ++ [urls: [url, canonical_url]] + opts + ) do + {:ok, + Map.merge(results || %{}, opts[:extra] || %{}) + |> Map.merge(%{ + content_type: "text/html", + canonical_url: if(canonical_url != url, do: canonical_url), + favicon: + if(!opts[:skip_favicon_fetch] and !opts[:skip_fetches], do: maybe_favicon(url, body)), + oembed: + opts[:extra][:oembed] || + if(!opts[:skip_oembed_fetch] and !opts[:skip_fetches], + do: Oembed.detect_and_fetch(url, body, opts) + ) + })} + end + end + + defp fetch(url, opts) do + fetch_oembed = Task.async(Oembed, :fetch, [url, opts]) + + fetch_html = if fetch_html_fn = opts[:fetch_html_fn] do + Task.async(fn -> fetch_html_fn.(url, opts) end) + else + Task.async(Fetcher, :fetch, [url, opts]) + end + + case Task.yield_many([fetch_oembed, fetch_html], timeout: 4000, on_timeout: :kill_task) do + [{_fetch_oembed, {:ok, {:ok, oembed}}}, {_fetch, {:ok, {:ok, body, status_code}}}] -> + # oembed found + HTML fetched + {:ok, {body, status_code}, oembed || Oembed.detect_and_fetch(url, body, opts)} + + [{_fetch_oembed, {:ok, {:ok, oembed}}}, other] -> + debug(other, "No HTML fetched") + # oembed was found from a known provider + {:ok, {nil, nil}, oembed} + + [other, {_fetch, {:ok, {:ok, body, status_code}}}] -> + debug(other, "No oembed found from known provider, try finding one in HTML") + {:ok, {body, status_code}, Oembed.detect_and_fetch(url, body, opts)} + + [other_oembed, other_html] -> + error(other_oembed, "Error fetching oembed") + error(other_html, "Error fetching HTML") + {:error, :fetch_error} + + e -> + error(e, 
"Error fetching oembed or HTML") + {:error, :fetch_error} + end + end + + defp parse(body, opts) do + parse = &Task.async(&1, :parse, [body, opts]) + tasks = Enum.map([Facebook, Twitter, JsonLD, RelMe, HTML], parse) + + with [facebook, twitter, json_ld, rel_me, other] <- Task.yield_many(tasks), + {_facebook, {:ok, {:ok, facebook}}} <- facebook, + {_twitter, {:ok, {:ok, twitter}}} <- twitter, + {_json_ld, {:ok, {:ok, json_ld}}} <- json_ld, + {_rel_me, {:ok, {:ok, rel_me}}} <- rel_me, + {_other, {:ok, {:ok, other}}} <- other do + {:ok, + %{ + facebook: facebook, + twitter: twitter, + json_ld: json_ld, + other: other, + rel_me: rel_me + }} + else + _ -> {:error, :parse_error} + end + end + + def maybe_favicon(url, body) do + case URI.parse(url) do + # %URI{host: nil, path: nil} -> + %URI{host: nil} -> + warn(url, "expected a valid URI, but got") + debug(body) + + with true <- body != [], + {:ok, url} <- Faviconic.find(nil, body) do + url + else + _ -> + nil + end + + # %URI{scheme: nil, host: nil, path: host_detected_as_path} -> + # with {:ok, url} <- Faviconic.find(host_detected_as_path, body) do + # url + # else _ -> + # nil + # end + + %URI{scheme: "doi"} -> + nil + + %URI{} -> + with {:ok, url} <- Faviconic.find(url, body) do + url + else + _ -> + nil + end + end + end + + @doc """ + Unshorten a URL by following redirects. + Returns {:ok, final_url} on success or {:error, reason} on failure. + + ## Examples + + iex> unshorten("https://bit.ly/example") + {:ok, "https://example.com/very/long/url"} + + """ + def unshorten(short_url) do + # TODO: integrate with `Unfurl.unfurl` so URL's are stored without shorteners (or at least the canonical url is added to metadata)? 
in which case we should avoid duplicated fetching of the head (also done by `Faviconic`) + + case Unfurl.Fetcher.head(short_url) do + {:ok, %{url: url} = head} -> + # The final URL after following redirects + debug(head, "headd") + {:ok, url} + + {:error, error} -> + error(error, "Failed to unshorten URL") + end + end + + def unshorten!(short_url) do + with {:ok, url} <- unshorten(short_url) do + url + else + _ -> + short_url + end + end + + def url_ip_address!(url) do + with {:ok, ip} <- url_ip_address(url) do + ip + else + _ -> + nil + end + end + + def url_ip_address(url) do + uri_host(url) + |> domain_ip_address() + end + + def domain_ip_address(host) when is_binary(host) do + with {:ok, {:hostent, _, _, _, _, [ip_tuple | _]}} <- + :inet.gethostbyname(String.to_charlist(host)) do + {:ok, :inet.ntoa(ip_tuple) |> to_string()} + else + e -> + error(e, "DNS resolution failed") + end + end + + def domain_ip_address(other), do: error(other, "Expected a hostname") + + def uri_host(%URI{host: nil} = _url), do: nil + def uri_host(%URI{host: host} = _url), do: host + + def uri_host(url) when is_binary(url) do + URI.parse(url) |> uri_host() + end + + @doc """ + Apply a function from this module to a list of items concurrently. + Returns a list of {:ok, final_url} or {:error, reason} tuples. 
+ + ## Examples + + iex> apply_many(:unshorten, ["https://bit.ly/ex1", "https://bit.ly/ex2"]) + [ + {:ok, "https://example.com/long/url1"}, + {:ok, "https://example.com/long/url2"} + ] + + iex> apply_many(:unshorten!, ["https://bit.ly/ex1", "https://bit.ly/ex2"]) + [ + "https://example.com/long/url1", + "https://example.com/long/url2" + ] + + iex> apply_many(:unfurl, ["https://bit.ly/ex1", "https://bit.ly/ex2"], skip_oembed_fetch: true) + [ + {:ok, %{oembed: nil} = _meta}, + {:ok, %{oembed: nil} = _meta} + ] + + """ + def apply_many(fun, items, extra_args \\ []) when is_list(items) do + items + |> Task.async_stream(__MODULE__, fun, [extra_args], timeout: 10_000) + |> Enum.map(fn + {:ok, result} -> result + other -> error(other) + end) + end +end diff --git a/mix.exs b/mix.exs index b6f8a40..df30a76 100644 --- a/mix.exs +++ b/mix.exs @@ -1,20 +1,20 @@ -defmodule Furlex.Mixfile do +defmodule Unfurl.Mixfile do use Mix.Project def project do [ - app: :furlex, - version: "0.5.0", + app: :unfurl, + version: "0.6.2", elixir: "~> 1.10", build_embedded: Mix.env() == :prod, start_permanent: Mix.env() == :prod, description: description(), package: package(), deps: deps(), - name: "Furlex", - source_url: "https://github.com/claytongentry/furlex", + name: "Unfurl", + source_url: "https://github.com/bonfire-networks/unfurl", docs: [ - main: "Furlex", + main: "Unfurl", extras: ~w(README.md CHANGELOG.md) ] ] @@ -26,26 +26,32 @@ defmodule Furlex.Mixfile do def application do # Specify extra applications you'll use from Erlang/Elixir [ - mod: {Furlex, []}, - extra_applications: [:httpoison, :logger] + mod: {Unfurl, []}, + extra_applications: [:logger] ] end defp deps do [ - {:floki, "~> 0.30.0"}, - {:httpoison, "~> 1.8"}, + {:tesla, "~> 1.4"}, + # optional, but recommended adapter for tesla + {:hackney, "~> 1.17", optional: true}, + {:floki, "~> 0.32"}, {:jason, "~> 1.2"}, - {:plug_cowboy, "~> 2.0"}, + {:plug_cowboy, "~> 2.6"}, + {:arrows, "~> 0.2"}, + {:untangle, "~> 0.3"}, 
{:benchee, "~> 1.0", only: :dev}, - {:ex_doc, "~> 0.23", only: :dev, runtime: false}, - {:bypass, "~> 2.1", only: :test} + {:ex_doc, "~> 0.28", only: :dev, runtime: false}, + {:bypass, "~> 2.1", only: :test}, + {:faviconic, "~> 0.2.1"} + # {:faviconic, git: "https://github.com/bonfire-networks/faviconic"} ] end defp description do """ - Furlex is a structured data extraction tool written in Elixir. + Unfurl is a structured data extraction tool written in Elixir. It currently supports unfurling oEmbed, Twitter Card, Facebook Open Graph, JSON-LD and plain ole' HTML `` data out of any url you supply. @@ -54,13 +60,13 @@ defmodule Furlex.Mixfile do defp package do [ - name: :furlex, - files: ~w(doc lib mix.exs README.md LICENSE.md CHANGELOG.md), - maintainers: ["Clayton Gentry"], - licenses: ["Apache 2.0"], + name: :unfurl, + files: ~w(lib mix.exs README.md LICENSE.md CHANGELOG.md), + maintainers: ["Bonfire Networks"], + licenses: ["Apache-2.0"], links: %{ - "Github" => "http://github.com/claytongentry/furlex", - "Docs" => "http://hexdocs.pm/furlex" + "Github" => "https://github.com/bonfire-networks/unfurl", + "Docs" => "http://hexdocs.pm/unfurl" } ] end diff --git a/mix.lock b/mix.lock index 6611c10..abab1ba 100644 --- a/mix.lock +++ b/mix.lock @@ -1,33 +1,45 @@ %{ - "benchee": {:hex, :benchee, "1.0.1", "66b211f9bfd84bd97e6d1beaddf8fc2312aaabe192f776e8931cb0c16f53a521", [:mix], [{:deep_merge, "~> 1.0", [hex: :deep_merge, repo: "hexpm", optional: false]}], "hexpm", "3ad58ae787e9c7c94dd7ceda3b587ec2c64604563e049b2a0e8baafae832addb"}, + "arrows": {:hex, :arrows, "0.2.0", "11c078b52303413a987aa39c63d82d4b1b1d18f52bb0437de168c1010b396dbe", [:mix], [], "hexpm", "4c09a0b1084c6d40d7cbac68ec803eec0e3ecb7624cc693efdd79f17598e6085"}, + "benchee": {:hex, :benchee, "1.3.0", "f64e3b64ad3563fa9838146ddefb2d2f94cf5b473bdfd63f5ca4d0657bf96694", [:mix], [{:deep_merge, "~> 1.0", [hex: :deep_merge, repo: "hexpm", optional: false]}, {:statistex, "~> 1.0", [hex: :statistex, repo: "hexpm", 
optional: false]}, {:table, "~> 0.1.0", [hex: :table, repo: "hexpm", optional: true]}], "hexpm", "34f4294068c11b2bd2ebf2c59aac9c7da26ffa0068afdf3419f1b176e16c5f81"}, "bypass": {:hex, :bypass, "2.1.0", "909782781bf8e20ee86a9cabde36b259d44af8b9f38756173e8f5e2e1fabb9b1", [:mix], [{:plug, "~> 1.7", [hex: :plug, repo: "hexpm", optional: false]}, {:plug_cowboy, "~> 2.0", [hex: :plug_cowboy, repo: "hexpm", optional: false]}, {:ranch, "~> 1.3", [hex: :ranch, repo: "hexpm", optional: false]}], "hexpm", "d9b5df8fa5b7a6efa08384e9bbecfe4ce61c77d28a4282f79e02f1ef78d96b80"}, - "certifi": {:hex, :certifi, "2.5.3", "70bdd7e7188c804f3a30ee0e7c99655bc35d8ac41c23e12325f36ab449b70651", [:rebar3], [{:parse_trans, "~>3.3", [hex: :parse_trans, repo: "hexpm", optional: false]}], "hexpm", "ed516acb3929b101208a9d700062d520f3953da3b6b918d866106ffa980e1c10"}, - "cowboy": {:hex, :cowboy, "2.8.0", "f3dc62e35797ecd9ac1b50db74611193c29815401e53bac9a5c0577bd7bc667d", [:rebar3], [{:cowlib, "~> 2.9.1", [hex: :cowlib, repo: "hexpm", optional: false]}, {:ranch, "~> 1.7.1", [hex: :ranch, repo: "hexpm", optional: false]}], "hexpm", "4643e4fba74ac96d4d152c75803de6fad0b3fa5df354c71afdd6cbeeb15fac8a"}, - "cowboy_telemetry": {:hex, :cowboy_telemetry, "0.3.1", "ebd1a1d7aff97f27c66654e78ece187abdc646992714164380d8a041eda16754", [:rebar3], [{:cowboy, "~> 2.7", [hex: :cowboy, repo: "hexpm", optional: false]}, {:telemetry, "~> 0.4", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "3a6efd3366130eab84ca372cbd4a7d3c3a97bdfcfb4911233b035d117063f0af"}, - "cowlib": {:hex, :cowlib, "2.9.1", "61a6c7c50cf07fdd24b2f45b89500bb93b6686579b069a89f88cb211e1125c78", [:rebar3], [], "hexpm", "e4175dc240a70d996156160891e1c62238ede1729e45740bdd38064dad476170"}, + "castore": {:hex, :castore, "1.0.5", "9eeebb394cc9a0f3ae56b813459f990abb0a3dedee1be6b27fdb50301930502f", [:mix], [], "hexpm", "8d7c597c3e4a64c395980882d4bca3cebb8d74197c590dc272cfd3b6a6310578"}, + "certifi": {:hex, :certifi, "2.12.0", 
"2d1cca2ec95f59643862af91f001478c9863c2ac9cb6e2f89780bfd8de987329", [:rebar3], [], "hexpm", "ee68d85df22e554040cdb4be100f33873ac6051387baf6a8f6ce82272340ff1c"}, + "cowboy": {:hex, :cowboy, "2.10.0", "ff9ffeff91dae4ae270dd975642997afe2a1179d94b1887863e43f681a203e26", [:make, :rebar3], [{:cowlib, "2.12.1", [hex: :cowlib, repo: "hexpm", optional: false]}, {:ranch, "1.8.0", [hex: :ranch, repo: "hexpm", optional: false]}], "hexpm", "3afdccb7183cc6f143cb14d3cf51fa00e53db9ec80cdcd525482f5e99bc41d6b"}, + "cowboy_telemetry": {:hex, :cowboy_telemetry, "0.4.0", "f239f68b588efa7707abce16a84d0d2acf3a0f50571f8bb7f56a15865aae820c", [:rebar3], [{:cowboy, "~> 2.7", [hex: :cowboy, repo: "hexpm", optional: false]}, {:telemetry, "~> 1.0", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "7d98bac1ee4565d31b62d59f8823dfd8356a169e7fcbb83831b8a5397404c9de"}, + "cowlib": {:hex, :cowlib, "2.12.1", "a9fa9a625f1d2025fe6b462cb865881329b5caff8f1854d1cbc9f9533f00e1e1", [:make, :rebar3], [], "hexpm", "163b73f6367a7341b33c794c4e88e7dbfe6498ac42dcd69ef44c5bc5507c8db0"}, + "decorator": {:hex, :decorator, "1.4.0", "a57ac32c823ea7e4e67f5af56412d12b33274661bb7640ec7fc882f8d23ac419", [:mix], [], "hexpm", "0a07cedd9083da875c7418dea95b78361197cf2bf3211d743f6f7ce39656597f"}, "deep_merge": {:hex, :deep_merge, "1.0.0", "b4aa1a0d1acac393bdf38b2291af38cb1d4a52806cf7a4906f718e1feb5ee961", [:mix], [], "hexpm", "ce708e5f094b9cd4e8f2be4f00d2f4250c4095be93f8cd6d018c753894885430"}, - "earmark": {:hex, :earmark, "1.3.2", "b840562ea3d67795ffbb5bd88940b1bed0ed9fa32834915125ea7d02e35888a5", [:mix], [], "hexpm"}, - "earmark_parser": {:hex, :earmark_parser, "1.4.12", "b245e875ec0a311a342320da0551da407d9d2b65d98f7a9597ae078615af3449", [:mix], [], "hexpm", "711e2cc4d64abb7d566d43f54b78f7dc129308a63bc103fbd88550d2174b3160"}, - "ex_doc": {:hex, :ex_doc, "0.23.0", "a069bc9b0bf8efe323ecde8c0d62afc13d308b1fa3d228b65bca5cf8703a529d", [:mix], [{:earmark_parser, "~> 1.4.0", [hex: :earmark_parser, repo: "hexpm", 
optional: false]}, {:makeup_elixir, "~> 0.14", [hex: :makeup_elixir, repo: "hexpm", optional: false]}], "hexpm", "f5e2c4702468b2fd11b10d39416ddadd2fcdd173ba2a0285ebd92c39827a5a16"}, - "floki": {:hex, :floki, "0.30.0", "22ebbe681a5d3777cdd830ca091b1b806d33c3449c26312eadca7f7be685c0c8", [:mix], [{:html_entities, "~> 0.5.0", [hex: :html_entities, repo: "hexpm", optional: false]}], "hexpm", "a9e128a4ca9bb71f11affa315b6768a9ad326d5996ff1e92acf1d7a01a10076a"}, - "hackney": {:hex, :hackney, "1.17.0", "717ea195fd2f898d9fe9f1ce0afcc2621a41ecfe137fae57e7fe6e9484b9aa99", [:rebar3], [{:certifi, "~>2.5", [hex: :certifi, repo: "hexpm", optional: false]}, {:idna, "~>6.1.0", [hex: :idna, repo: "hexpm", optional: false]}, {:metrics, "~>1.0.0", [hex: :metrics, repo: "hexpm", optional: false]}, {:mimerl, "~>1.1", [hex: :mimerl, repo: "hexpm", optional: false]}, {:parse_trans, "~>3.3", [hex: :parse_trans, repo: "hexpm", optional: false]}, {:ssl_verify_fun, "~>1.1.0", [hex: :ssl_verify_fun, repo: "hexpm", optional: false]}, {:unicode_util_compat, "~>0.7.0", [hex: :unicode_util_compat, repo: "hexpm", optional: false]}], "hexpm", "64c22225f1ea8855f584720c0e5b3cd14095703af1c9fbc845ba042811dc671c"}, + "earmark_parser": {:hex, :earmark_parser, "1.4.39", "424642f8335b05bb9eb611aa1564c148a8ee35c9c8a8bba6e129d51a3e3c6769", [:mix], [], "hexpm", "06553a88d1f1846da9ef066b87b57c6f605552cfbe40d20bd8d59cc6bde41944"}, + "ex_doc": {:hex, :ex_doc, "0.31.1", "8a2355ac42b1cc7b2379da9e40243f2670143721dd50748bf6c3b1184dae2089", [:mix], [{:earmark_parser, "~> 1.4.39", [hex: :earmark_parser, repo: "hexpm", optional: false]}, {:makeup_c, ">= 0.1.1", [hex: :makeup_c, repo: "hexpm", optional: true]}, {:makeup_elixir, "~> 0.14", [hex: :makeup_elixir, repo: "hexpm", optional: false]}, {:makeup_erlang, "~> 0.1", [hex: :makeup_erlang, repo: "hexpm", optional: false]}], "hexpm", "3178c3a407c557d8343479e1ff117a96fd31bafe52a039079593fb0524ef61b0"}, + "faviconic": {:hex, :faviconic, "0.2.1", 
"4d6ff31f2ee01d8ed91e401bacbae64bcc78a2a0534deb9b7842f4031c849e0b", [:mix], [{:floki, "~> 0.32", [hex: :floki, repo: "hexpm", optional: false]}, {:req, "~> 0.3", [hex: :req, repo: "hexpm", optional: false]}, {:untangle, "~> 0.3", [hex: :untangle, repo: "hexpm", optional: false]}], "hexpm", "24f3628abd9b55d75e4f90edf6e8dfb97d0baf834345d40342232622d2094655"}, + "finch": {:hex, :finch, "0.18.0", "944ac7d34d0bd2ac8998f79f7a811b21d87d911e77a786bc5810adb75632ada4", [:mix], [{:castore, "~> 0.1 or ~> 1.0", [hex: :castore, repo: "hexpm", optional: false]}, {:mime, "~> 1.0 or ~> 2.0", [hex: :mime, repo: "hexpm", optional: false]}, {:mint, "~> 1.3", [hex: :mint, repo: "hexpm", optional: false]}, {:nimble_options, "~> 0.4 or ~> 1.0", [hex: :nimble_options, repo: "hexpm", optional: false]}, {:nimble_pool, "~> 0.2.6 or ~> 1.0", [hex: :nimble_pool, repo: "hexpm", optional: false]}, {:telemetry, "~> 0.4 or ~> 1.0", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "69f5045b042e531e53edc2574f15e25e735b522c37e2ddb766e15b979e03aa65"}, + "floki": {:hex, :floki, "0.35.4", "cc947b446024732c07274ac656600c5c4dc014caa1f8fb2dfff93d275b83890d", [:mix], [], "hexpm", "27fa185d3469bd8fc5947ef0f8d5c4e47f0af02eb6b070b63c868f69e3af0204"}, + "hackney": {:hex, :hackney, "1.20.1", "8d97aec62ddddd757d128bfd1df6c5861093419f8f7a4223823537bad5d064e2", [:rebar3], [{:certifi, "~> 2.12.0", [hex: :certifi, repo: "hexpm", optional: false]}, {:idna, "~> 6.1.0", [hex: :idna, repo: "hexpm", optional: false]}, {:metrics, "~> 1.0.0", [hex: :metrics, repo: "hexpm", optional: false]}, {:mimerl, "~> 1.1", [hex: :mimerl, repo: "hexpm", optional: false]}, {:parse_trans, "3.4.1", [hex: :parse_trans, repo: "hexpm", optional: false]}, {:ssl_verify_fun, "~> 1.1.0", [hex: :ssl_verify_fun, repo: "hexpm", optional: false]}, {:unicode_util_compat, "~> 0.7.0", [hex: :unicode_util_compat, repo: "hexpm", optional: false]}], "hexpm", "fe9094e5f1a2a2c0a7d10918fee36bfec0ec2a979994cff8cfe8058cd9af38e3"}, + "hpax": {:hex, 
:hpax, "0.1.2", "09a75600d9d8bbd064cdd741f21fc06fc1f4cf3d0fcc335e5aa19be1a7235c84", [:mix], [], "hexpm", "2c87843d5a23f5f16748ebe77969880e29809580efdaccd615cd3bed628a8c13"}, "html_entities": {:hex, :html_entities, "0.5.2", "9e47e70598da7de2a9ff6af8758399251db6dbb7eebe2b013f2bbd2515895c3c", [:mix], [], "hexpm", "c53ba390403485615623b9531e97696f076ed415e8d8058b1dbaa28181f4fdcc"}, - "httpoison": {:hex, :httpoison, "1.8.0", "6b85dea15820b7804ef607ff78406ab449dd78bed923a49c7160e1886e987a3d", [:mix], [{:hackney, "~> 1.17", [hex: :hackney, repo: "hexpm", optional: false]}], "hexpm", "28089eaa98cf90c66265b6b5ad87c59a3729bea2e74e9d08f9b51eb9729b3c3a"}, - "idna": {:hex, :idna, "6.1.1", "8a63070e9f7d0c62eb9d9fcb360a7de382448200fbbd1b106cc96d3d8099df8d", [:rebar3], [{:unicode_util_compat, "~>0.7.0", [hex: :unicode_util_compat, repo: "hexpm", optional: false]}], "hexpm", "92376eb7894412ed19ac475e4a86f7b413c1b9fbb5bd16dccd57934157944cea"}, - "jason": {:hex, :jason, "1.2.2", "ba43e3f2709fd1aa1dce90aaabfd039d000469c05c56f0b8e31978e03fa39052", [:mix], [{:decimal, "~> 1.0 or ~> 2.0", [hex: :decimal, repo: "hexpm", optional: true]}], "hexpm", "18a228f5f0058ee183f29f9eae0805c6e59d61c3b006760668d8d18ff0d12179"}, - "makeup": {:hex, :makeup, "1.0.5", "d5a830bc42c9800ce07dd97fa94669dfb93d3bf5fcf6ea7a0c67b2e0e4a7f26c", [:mix], [{:nimble_parsec, "~> 0.5 or ~> 1.0", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "cfa158c02d3f5c0c665d0af11512fed3fba0144cf1aadee0f2ce17747fba2ca9"}, - "makeup_elixir": {:hex, :makeup_elixir, "0.15.1", "b5888c880d17d1cc3e598f05cdb5b5a91b7b17ac4eaf5f297cb697663a1094dd", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}, {:nimble_parsec, "~> 1.1", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "db68c173234b07ab2a07f645a5acdc117b9f99d69ebf521821d89690ae6c6ec8"}, + "idna": {:hex, :idna, "6.1.1", "8a63070e9f7d0c62eb9d9fcb360a7de382448200fbbd1b106cc96d3d8099df8d", [:rebar3], [{:unicode_util_compat, "~> 
0.7.0", [hex: :unicode_util_compat, repo: "hexpm", optional: false]}], "hexpm", "92376eb7894412ed19ac475e4a86f7b413c1b9fbb5bd16dccd57934157944cea"}, + "jason": {:hex, :jason, "1.4.1", "af1504e35f629ddcdd6addb3513c3853991f694921b1b9368b0bd32beb9f1b63", [:mix], [{:decimal, "~> 1.0 or ~> 2.0", [hex: :decimal, repo: "hexpm", optional: true]}], "hexpm", "fbb01ecdfd565b56261302f7e1fcc27c4fb8f32d56eab74db621fc154604a7a1"}, + "makeup": {:hex, :makeup, "1.1.1", "fa0bc768698053b2b3869fa8a62616501ff9d11a562f3ce39580d60860c3a55e", [:mix], [{:nimble_parsec, "~> 1.2.2 or ~> 1.3", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "5dc62fbdd0de44de194898b6710692490be74baa02d9d108bc29f007783b0b48"}, + "makeup_elixir": {:hex, :makeup_elixir, "0.16.1", "cc9e3ca312f1cfeccc572b37a09980287e243648108384b97ff2b76e505c3555", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}, {:nimble_parsec, "~> 1.2.3 or ~> 1.3", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "e127a341ad1b209bd80f7bd1620a15693a9908ed780c3b763bccf7d200c767c6"}, + "makeup_erlang": {:hex, :makeup_erlang, "0.1.4", "29563475afa9b8a2add1b7a9c8fb68d06ca7737648f28398e04461f008b69521", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}], "hexpm", "f4ed47ecda66de70dd817698a703f8816daa91272e7e45812469498614ae8b29"}, "metrics": {:hex, :metrics, "1.0.1", "25f094dea2cda98213cecc3aeff09e940299d950904393b2a29d191c346a8486", [:rebar3], [], "hexpm", "69b09adddc4f74a40716ae54d140f93beb0fb8978d8636eaded0c31b6f099f16"}, - "mime": {:hex, :mime, "1.5.0", "203ef35ef3389aae6d361918bf3f952fa17a09e8e43b5aa592b93eba05d0fb8d", [:mix], [], "hexpm", "55a94c0f552249fc1a3dd9cd2d3ab9de9d3c89b559c2bd01121f824834f24746"}, + "mime": {:hex, :mime, "2.0.5", "dc34c8efd439abe6ae0343edbb8556f4d63f178594894720607772a041b04b02", [:mix], [], "hexpm", "da0d64a365c45bc9935cc5c8a7fc5e49a0e0f9932a761c55d6c52b142780a05c"}, "mimerl": {:hex, :mimerl, "1.2.0", 
"67e2d3f571088d5cfd3e550c383094b47159f3eee8ffa08e64106cdf5e981be3", [:rebar3], [], "hexpm", "f278585650aa581986264638ebf698f8bb19df297f66ad91b18910dfc6e19323"}, - "mochiweb": {:hex, :mochiweb, "2.18.0", "eb55f1db3e6e960fac4e6db4e2db9ec3602cc9f30b86cd1481d56545c3145d2e", [:rebar3], [], "hexpm"}, - "nimble_parsec": {:hex, :nimble_parsec, "1.1.0", "3a6fca1550363552e54c216debb6a9e95bd8d32348938e13de5eda962c0d7f89", [:mix], [], "hexpm", "08eb32d66b706e913ff748f11694b17981c0b04a33ef470e33e11b3d3ac8f54b"}, - "parse_trans": {:hex, :parse_trans, "3.3.1", "16328ab840cc09919bd10dab29e431da3af9e9e7e7e6f0089dd5a2d2820011d8", [:rebar3], [], "hexpm", "07cd9577885f56362d414e8c4c4e6bdf10d43a8767abb92d24cbe8b24c54888b"}, - "plug": {:hex, :plug, "1.11.0", "f17217525597628298998bc3baed9f8ea1fa3f1160aa9871aee6df47a6e4d38e", [:mix], [{:mime, "~> 1.0", [hex: :mime, repo: "hexpm", optional: false]}, {:plug_crypto, "~> 1.1.1 or ~> 1.2", [hex: :plug_crypto, repo: "hexpm", optional: false]}, {:telemetry, "~> 0.4", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "2d9c633f0499f9dc5c2fd069161af4e2e7756890b81adcbb2ceaa074e8308876"}, - "plug_cowboy": {:hex, :plug_cowboy, "2.4.1", "779ba386c0915027f22e14a48919a9545714f849505fa15af2631a0d298abf0f", [:mix], [{:cowboy, "~> 2.7", [hex: :cowboy, repo: "hexpm", optional: false]}, {:cowboy_telemetry, "~> 0.3", [hex: :cowboy_telemetry, repo: "hexpm", optional: false]}, {:plug, "~> 1.7", [hex: :plug, repo: "hexpm", optional: false]}, {:telemetry, "~> 0.4", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "d72113b6dff7b37a7d9b2a5b68892808e3a9a752f2bf7e503240945385b70507"}, - "plug_crypto": {:hex, :plug_crypto, "1.2.1", "5c854427528bf61d159855cedddffc0625e2228b5f30eff76d5a4de42d896ef4", [:mix], [], "hexpm", "6961c0e17febd9d0bfa89632d391d2545d2e0eb73768f5f50305a23961d8782c"}, - "ranch": {:hex, :ranch, "1.7.1", "6b1fab51b49196860b733a49c07604465a47bdb78aa10c1c16a3d199f7f8c881", [:rebar3], [], "hexpm", 
"451d8527787df716d99dc36162fca05934915db0b6141bbdac2ea8d3c7afc7d7"}, - "ssl_verify_fun": {:hex, :ssl_verify_fun, "1.1.6", "cf344f5692c82d2cd7554f5ec8fd961548d4fd09e7d22f5b62482e5aeaebd4b0", [:make, :mix, :rebar3], [], "hexpm", "bdb0d2471f453c88ff3908e7686f86f9be327d065cc1ec16fa4540197ea04680"}, - "telemetry": {:hex, :telemetry, "0.4.2", "2808c992455e08d6177322f14d3bdb6b625fbcfd233a73505870d8738a2f4599", [:rebar3], [], "hexpm", "2d1419bd9dda6a206d7b5852179511722e2b18812310d304620c7bd92a13fcef"}, + "mint": {:hex, :mint, "1.5.2", "4805e059f96028948870d23d7783613b7e6b0e2fb4e98d720383852a760067fd", [:mix], [{:castore, "~> 0.1.0 or ~> 1.0", [hex: :castore, repo: "hexpm", optional: true]}, {:hpax, "~> 0.1.1", [hex: :hpax, repo: "hexpm", optional: false]}], "hexpm", "d77d9e9ce4eb35941907f1d3df38d8f750c357865353e21d335bdcdf6d892a02"}, + "nimble_options": {:hex, :nimble_options, "1.1.0", "3b31a57ede9cb1502071fade751ab0c7b8dbe75a9a4c2b5bbb0943a690b63172", [:mix], [], "hexpm", "8bbbb3941af3ca9acc7835f5655ea062111c9c27bcac53e004460dfd19008a99"}, + "nimble_ownership": {:hex, :nimble_ownership, "0.2.1", "3e44c72ebe8dd213db4e13aff4090aaa331d158e72ce1891d02e0ffb05a1eb2d", [:mix], [], "hexpm", "bf38d2ef4fb990521a4ecf112843063c1f58a5c602484af4c7977324042badee"}, + "nimble_parsec": {:hex, :nimble_parsec, "1.4.0", "51f9b613ea62cfa97b25ccc2c1b4216e81df970acd8e16e8d1bdc58fef21370d", [:mix], [], "hexpm", "9c565862810fb383e9838c1dd2d7d2c437b3d13b267414ba6af33e50d2d1cf28"}, + "nimble_pool": {:hex, :nimble_pool, "1.0.0", "5eb82705d138f4dd4423f69ceb19ac667b3b492ae570c9f5c900bb3d2f50a847", [:mix], [], "hexpm", "80be3b882d2d351882256087078e1b1952a28bf98d0a287be87e4a24a710b67a"}, + "parse_trans": {:hex, :parse_trans, "3.4.1", "6e6aa8167cb44cc8f39441d05193be6e6f4e7c2946cb2759f015f8c56b76e5ff", [:rebar3], [], "hexpm", "620a406ce75dada827b82e453c19cf06776be266f5a67cff34e1ef2cbb60e49a"}, + "plug": {:hex, :plug, "1.15.3", "712976f504418f6dff0a3e554c40d705a9bcf89a7ccef92fc6a5ef8f16a30a97", [:mix], 
[{:mime, "~> 1.0 or ~> 2.0", [hex: :mime, repo: "hexpm", optional: false]}, {:plug_crypto, "~> 1.1.1 or ~> 1.2 or ~> 2.0", [hex: :plug_crypto, repo: "hexpm", optional: false]}, {:telemetry, "~> 0.4.3 or ~> 1.0", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "cc4365a3c010a56af402e0809208873d113e9c38c401cabd88027ef4f5c01fd2"}, + "plug_cowboy": {:hex, :plug_cowboy, "2.7.0", "3ae9369c60641084363b08fe90267cbdd316df57e3557ea522114b30b63256ea", [:mix], [{:cowboy, "~> 2.7.0 or ~> 2.8.0 or ~> 2.9.0 or ~> 2.10.0", [hex: :cowboy, repo: "hexpm", optional: false]}, {:cowboy_telemetry, "~> 0.3", [hex: :cowboy_telemetry, repo: "hexpm", optional: false]}, {:plug, "~> 1.14", [hex: :plug, repo: "hexpm", optional: false]}], "hexpm", "d85444fb8aa1f2fc62eabe83bbe387d81510d773886774ebdcb429b3da3c1a4a"}, + "plug_crypto": {:hex, :plug_crypto, "2.0.0", "77515cc10af06645abbfb5e6ad7a3e9714f805ae118fa1a70205f80d2d70fe73", [:mix], [], "hexpm", "53695bae57cc4e54566d993eb01074e4d894b65a3766f1c43e2c61a1b0f45ea9"}, + "ranch": {:hex, :ranch, "1.8.0", "8c7a100a139fd57f17327b6413e4167ac559fbc04ca7448e9be9057311597a1d", [:make, :rebar3], [], "hexpm", "49fbcfd3682fab1f5d109351b61257676da1a2fdbe295904176d5e521a2ddfe5"}, + "req": {:hex, :req, "0.4.11", "cb19f87d5251e7de30cfc67d1899696b290711092207c6b2e8fc2294f237fcdc", [:mix], [{:aws_signature, "~> 0.3.2", [hex: :aws_signature, repo: "hexpm", optional: true]}, {:brotli, "~> 0.3.1", [hex: :brotli, repo: "hexpm", optional: true]}, {:ezstd, "~> 1.0", [hex: :ezstd, repo: "hexpm", optional: true]}, {:finch, "~> 0.17", [hex: :finch, repo: "hexpm", optional: false]}, {:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: false]}, {:mime, "~> 1.6 or ~> 2.0", [hex: :mime, repo: "hexpm", optional: false]}, {:nimble_csv, "~> 1.0", [hex: :nimble_csv, repo: "hexpm", optional: true]}, {:nimble_ownership, "~> 0.2.0", [hex: :nimble_ownership, repo: "hexpm", optional: false]}, {:plug, "~> 1.0", [hex: :plug, repo: "hexpm", optional: true]}], "hexpm", 
"bbf4f2393c649fa4146a3b8470e2a7e8c9b23e4100a16c75f5e7d1d3d33144f3"}, + "ssl_verify_fun": {:hex, :ssl_verify_fun, "1.1.7", "354c321cf377240c7b8716899e182ce4890c5938111a1296add3ec74cf1715df", [:make, :mix, :rebar3], [], "hexpm", "fe4c190e8f37401d30167c8c405eda19469f34577987c76dde613e838bbc67f8"}, + "statistex": {:hex, :statistex, "1.0.0", "f3dc93f3c0c6c92e5f291704cf62b99b553253d7969e9a5fa713e5481cd858a5", [:mix], [], "hexpm", "ff9d8bee7035028ab4742ff52fc80a2aa35cece833cf5319009b52f1b5a86c27"}, + + "tesla": {:hex, :tesla, "1.8.0", "d511a4f5c5e42538d97eef7c40ec4f3e44effdc5068206f42ed859e09e51d1fd", [:mix], [{:castore, "~> 0.1 or ~> 1.0", [hex: :castore, repo: "hexpm", optional: true]}, {:exjsx, ">= 3.0.0", [hex: :exjsx, repo: "hexpm", optional: true]}, {:finch, "~> 0.13", [hex: :finch, repo: "hexpm", optional: true]}, {:fuse, "~> 2.4", [hex: :fuse, repo: "hexpm", optional: true]}, {:gun, ">= 1.0.0", [hex: :gun, repo: "hexpm", optional: true]}, {:hackney, "~> 1.6", [hex: :hackney, repo: "hexpm", optional: true]}, {:ibrowse, "4.4.2", [hex: :ibrowse, repo: "hexpm", optional: true]}, {:jason, ">= 1.0.0", [hex: :jason, repo: "hexpm", optional: true]}, {:mime, "~> 1.0 or ~> 2.0", [hex: :mime, repo: "hexpm", optional: false]}, {:mint, "~> 1.0", [hex: :mint, repo: "hexpm", optional: true]}, {:msgpax, "~> 2.3", [hex: :msgpax, repo: "hexpm", optional: true]}, {:poison, ">= 1.0.0", [hex: :poison, repo: "hexpm", optional: true]}, {:telemetry, "~> 0.4 or ~> 1.0", [hex: :telemetry, repo: "hexpm", optional: true]}], "hexpm", "10501f360cd926a309501287470372af1a6e1cbed0f43949203a4c13300bc79f"}, "unicode_util_compat": {:hex, :unicode_util_compat, "0.7.0", "bc84380c9ab48177092f43ac89e4dfa2c6d62b40b8bd132b1059ecc7232f9a78", [:rebar3], [], "hexpm", "25eee6d67df61960cf6a794239566599b09e17e668d3700247bc498638152521"}, + "untangle": {:hex, :untangle, "0.3.0", "8fb3038ddc96aeec9989aa7b410355297231bd5cee7b718e24aa1efe174fce3a", [:mix], [{:decorator, "~> 1.4", [hex: :decorator, repo: "hexpm", 
optional: false]}], "hexpm", "9ac26c397155636feaed260b8edcfa01f62af62ab929bad385fbb94beacb5a38"}, } diff --git a/test/fixtures/rel_me_anchor.html b/test/fixtures/rel_me_anchor.html new file mode 100644 index 0000000..c8cd39f --- /dev/null +++ b/test/fixtures/rel_me_anchor.html @@ -0,0 +1,14 @@ + + + + + Blog + + +
+

Lorem ipsum

+

Lorem ipsum dolor sit ameph, …

+ test account +
+ + diff --git a/test/fixtures/rel_me_anchor_nofollow.html b/test/fixtures/rel_me_anchor_nofollow.html new file mode 100644 index 0000000..db20b9e --- /dev/null +++ b/test/fixtures/rel_me_anchor_nofollow.html @@ -0,0 +1,14 @@ + + + + + Blog + + +
+

Lorem ipsum

+

Lorem ipsum dolor sit ameph, …

+ test account +
+ + diff --git a/test/fixtures/rel_me_link.html b/test/fixtures/rel_me_link.html new file mode 100644 index 0000000..fdd826d --- /dev/null +++ b/test/fixtures/rel_me_link.html @@ -0,0 +1,14 @@ + + + + + Blog + + + +
+

Lorem ipsum

+

Lorem ipsum dolor sit ameph, …

+
+ + diff --git a/test/fixtures/rel_me_null.html b/test/fixtures/rel_me_null.html new file mode 100644 index 0000000..f238f17 --- /dev/null +++ b/test/fixtures/rel_me_null.html @@ -0,0 +1,14 @@ + + + + + Blog + + +
+

Lorem ipsum

+

Lorem ipsum dolor sit ameph, …

+ test account +
+ + diff --git a/test/fixtures/rel_me_third_party.html b/test/fixtures/rel_me_third_party.html new file mode 100644 index 0000000..ec4eacf --- /dev/null +++ b/test/fixtures/rel_me_third_party.html @@ -0,0 +1,16 @@ + + + + + Blog + + +
+

Lorem ipsum

+

Lorem ipsum dolor sit ameph, …

+ test account + test2 account + test3 account +
+ + diff --git a/test/furlex/fetcher_test.exs b/test/furlex/fetcher_test.exs index f53a6d6..8de9916 100644 --- a/test/furlex/fetcher_test.exs +++ b/test/furlex/fetcher_test.exs @@ -1,7 +1,7 @@ -defmodule Furlex.FetcherTest do +defmodule Unfurl.FetcherTest do use ExUnit.Case - alias Furlex.Fetcher + alias Unfurl.Fetcher doctest Fetcher @@ -12,6 +12,7 @@ defmodule Furlex.FetcherTest do {:ok, bypass: bypass, url: url} end + @tag :fixme test "fetches url", %{bypass: bypass, url: url} do Bypass.expect_once(bypass, &handle/1) @@ -19,11 +20,6 @@ defmodule Furlex.FetcherTest do assert body =~ "Test HTML" end - test "fetches url with options", %{url: url} do - assert {:error, %HTTPoison.Error{reason: :checkout_timeout}} == - Fetcher.fetch(url, timeout: 0) - end - def handle(conn) do body = [__DIR__ | ~w(.. fixtures test.html)] diff --git a/test/furlex/oembed_test.exs b/test/furlex/oembed_test.exs index a89a41f..540b9f4 100644 --- a/test/furlex/oembed_test.exs +++ b/test/furlex/oembed_test.exs @@ -1,18 +1,18 @@ -defmodule Furlex.OembedTest do +defmodule Unfurl.OembedTest do use ExUnit.Case - alias Furlex.Oembed + alias Unfurl.Oembed setup do bypass = Bypass.open() url = "http://localhost:#{bypass.port}" - config = Application.get_env(:furlex, Oembed, []) + config = Application.get_env(:unfurl, Oembed, []) new_config = Keyword.put(config, :oembed_host, url) - Application.put_env(:furlex, Oembed, new_config) + Application.put_env(:unfurl, Oembed, new_config) on_exit(fn -> - Application.put_env(:furlex, Oembed, config) + Application.put_env(:unfurl, Oembed, config) :ok end) @@ -20,8 +20,9 @@ defmodule Furlex.OembedTest do {:ok, bypass: bypass} end + @tag :fixme test "returns endpoint from url", %{bypass: bypass} do - Bypass.expect(bypass, &handle/1) + # Bypass.expect(bypass, &handle/1) assert {:error, :no_oembed_provider} == Oembed.endpoint_from_url("foobar") @@ -31,7 +32,7 @@ defmodule Furlex.OembedTest do {:ok, endpoint} = Oembed.endpoint_from_url(url, params, skip_cache?: 
true) - assert endpoint == "https://vimeo.com/api/oembed.json" + assert to_string(endpoint) =~ "https://vimeo.com/api/oembed.json" end def handle(%{request_path: "/providers.json"} = conn) do diff --git a/test/furlex/parser/common_html_test.exs b/test/furlex/parser/common_html_test.exs new file mode 100644 index 0000000..2d6d3f4 --- /dev/null +++ b/test/furlex/parser/common_html_test.exs @@ -0,0 +1,42 @@ +defmodule Unfurl.Parser.CommonHTMLTest do + use ExUnit.Case + + alias Unfurl.Parser.HTML + + test "parses CustomHTML meta data" do + html = + [__DIR__ | ~w(.. .. fixtures test.html)] + |> Path.join() + |> File.read!() + + assert {:ok, meta} = HTML.parse(html) + + assert meta == %{ + "title" => "Test HTML", + "description" => "This is test content." + } + end + + test "dedupes meta data" do + html = + [__DIR__ | ~w(.. .. fixtures duplicate_meta.html)] + |> Path.join() + |> File.read!() + + assert {:ok, meta} = HTML.parse(html) + # IO.inspect(meta) + + assert meta["generator"] == "Loja Integrada" + + assert meta["google-site-verification"] == [ + "GbnYBmQLHGrgQRVEi4b2fzcrAA81TMh86T3Z1kDDW-c", + "og5Ef6ntOLY0CrU0H8mURx_WwrlZc9Hz2HDXQGWOdAg", + "66Kpz8sWyMtS35U7Eodir6sXoV5gJe7a9kNN9xQQnYE" + ] + + assert meta["robots"] == "index, follow" + + # Ensure resultant meta is encodable + assert {:ok, _json} = Jason.encode(meta) + end +end diff --git a/test/furlex/parser/facebook_test.exs b/test/furlex/parser/facebook_test.exs index 0c283cb..7ba6474 100644 --- a/test/furlex/parser/facebook_test.exs +++ b/test/furlex/parser/facebook_test.exs @@ -1,7 +1,7 @@ -defmodule Furlex.Parser.FacebookTest do +defmodule Unfurl.Parser.FacebookTest do use ExUnit.Case - alias Furlex.Parser.Facebook + alias Unfurl.Parser.Facebook doctest Facebook @@ -10,11 +10,6 @@ defmodule Furlex.Parser.FacebookTest do " "content=\"www.example.com\"/>" - assert {:ok, - %{ - "og" => %{ - "url" => "www.example.com" - } - }} == Facebook.parse(html) + assert {:ok, %{"url" => "www.example.com"}} == 
Facebook.parse(html) end end diff --git a/test/furlex/parser/html_test.exs b/test/furlex/parser/html_test.exs index d8399f7..eb5d907 100644 --- a/test/furlex/parser/html_test.exs +++ b/test/furlex/parser/html_test.exs @@ -1,9 +1,9 @@ -defmodule Furlex.Parser.HTMLTest do +defmodule Unfurl.Parser.HTMLTest do use ExUnit.Case - alias Furlex.Parser.HTML + alias Unfurl.Parser.HTML - @json_library Application.get_env(:furlex, :json_library, Jason) + @json_library Application.compile_env(:unfurl, :json_library, Jason) doctest HTML @@ -14,7 +14,7 @@ defmodule Furlex.Parser.HTMLTest do |> File.read!() assert {:ok, meta} = HTML.parse(html) - assert meta == %{"description" => "This is test content."} + assert %{"description" => "This is test content."} = meta end test "dedupes meta data" do diff --git a/test/furlex/parser/json_ld_test.exs b/test/furlex/parser/json_ld_test.exs index 9c2cf84..af31967 100644 --- a/test/furlex/parser/json_ld_test.exs +++ b/test/furlex/parser/json_ld_test.exs @@ -1,7 +1,7 @@ -defmodule Furlex.Parser.JsonLDTest do +defmodule Unfurl.Parser.JsonLDTest do use ExUnit.Case - alias Furlex.Parser.JsonLD + alias Unfurl.Parser.JsonLD doctest JsonLD @@ -22,4 +22,26 @@ defmodule Furlex.Parser.JsonLDTest do assert Map.get(json_ld, "@type") == "WebSite" assert Map.get(json_ld, "url") == "https://www.example.com" end + + test "ignores invalid JSON-LD and does not raise" do + html = """ + + + + + + + """ + + assert {:ok, [json_ld]} = JsonLD.parse(html) + assert Map.get(json_ld, "name") == "Alice" + assert Map.get(json_ld, "@type") == "Person" + end end diff --git a/test/furlex/parser/rel_me_test.exs b/test/furlex/parser/rel_me_test.exs new file mode 100644 index 0000000..9a49fe4 --- /dev/null +++ b/test/furlex/parser/rel_me_test.exs @@ -0,0 +1,144 @@ +defmodule Unfurl.RelMeTest do + use ExUnit.Case + import Tesla.Mock + import Untangle + + setup_all do + Tesla.Mock.mock_global(fn + %{method: :get, url: url} -> + get(url, nil, nil, nil) + + _ -> + 
%Tesla.Env{status: 304, body: "{error: 'No implemented mock response'}"} + end) + |> IO.inspect(label: "setup done") + + :ok + end + + def get("http://example.com/rel_me/anchor", _, _, _) do + {:ok, + %Tesla.Env{ + status: 200, + body: + "../../fixtures/rel_me_anchor.html" + |> Path.expand(__DIR__) + |> File.read!() + }} + end + + def get("http://example.com/rel_me/anchor_nofollow", _, _, _) do + {:ok, + %Tesla.Env{ + status: 200, + body: + "../../fixtures/rel_me_anchor_nofollow.html" + |> Path.expand(__DIR__) + |> File.read!() + }} + end + + def get("http://example.com/rel_me/link", _, _, _) do + {:ok, + %Tesla.Env{ + status: 200, + body: + "../../fixtures/rel_me_link.html" + |> Path.expand(__DIR__) + |> File.read!() + }} + end + + def get("http://example.com/rel_me/third_party", _, _, _) do + {:ok, + %Tesla.Env{ + status: 200, + body: + "../../fixtures/rel_me_third_party.html" + |> Path.expand(__DIR__) + |> File.read!() + }} + end + + def get("http://example.com/rel_me/null", _, _, _) do + {:ok, + %Tesla.Env{ + status: 200, + body: + "../../fixtures/rel_me_null.html" + |> Path.expand(__DIR__) + |> File.read!() + }} + end + + def get("https://oembed.com/providers.json", _, _, _) do + {:ok, + %Tesla.Env{ + status: 200, + body: + "../../fixtures/providers.json" + |> Path.expand(__DIR__) + |> File.read!() + }} + end + + def get(_, _, _, _) do + %Tesla.Env{status: 304, body: "{error: 'No implemented mock response'}"} + end + + describe "rel_me" do + test "parse works for valid rel=me links" do + hrefs = ["https://social.example.org/users/test"] + + assert {:ok, %{rel_me: nil}} = Unfurl.unfurl("http://example.com/rel_me/null") + + assert {:ok, + %{ + rel_me: %{ + urls: [ + "https://social.example.org/users/test2nd", + "https://social.example.org/users/test3rd" + ] + } + }} = Unfurl.unfurl("http://example.com/rel_me/third_party") + + assert {:ok, %{rel_me: nil}} = + Unfurl.unfurl("http://example.com/rel_me/error") + + assert {:ok, %{rel_me: %{urls: hrefs}}} = 
Unfurl.unfurl("http://example.com/rel_me/link") + assert {:ok, %{rel_me: %{urls: hrefs}}} = Unfurl.unfurl("http://example.com/rel_me/anchor") + + assert {:ok, %{rel_me: %{urls: hrefs}}} = + Unfurl.unfurl("http://example.com/rel_me/anchor_nofollow") + end + + test "parse returns true for valid rel=me links when actor link provided" do + hrefs = ["https://social.example.org/users/test"] + + assert {:ok, %{rel_me: nil}} = + Unfurl.unfurl("http://example.com/rel_me/null", rel_me_urls: hrefs) + + assert {:ok, + %{ + rel_me: %{ + urls: [ + "https://social.example.org/users/test2nd", + "https://social.example.org/users/test3rd" + ] + } + }} = Unfurl.unfurl("http://example.com/rel_me/third_party", rel_me_urls: hrefs) + + assert {:ok, %{rel_me: nil}} = + Unfurl.unfurl("http://example.com/rel_me/error", rel_me_urls: hrefs) + + assert {:ok, %{rel_me: %{urls: hrefs, verified: true}}} = + Unfurl.unfurl("http://example.com/rel_me/link", rel_me_urls: hrefs) + + assert {:ok, %{rel_me: %{urls: hrefs, verified: true}}} = + Unfurl.unfurl("http://example.com/rel_me/anchor", rel_me_urls: hrefs) + + assert {:ok, %{rel_me: %{urls: hrefs, verified: true}}} = + Unfurl.unfurl("http://example.com/rel_me/anchor_nofollow", rel_me_urls: hrefs) + end + end +end diff --git a/test/furlex/parser/twitter_test.exs b/test/furlex/parser/twitter_test.exs index adf2768..07a6be1 100644 --- a/test/furlex/parser/twitter_test.exs +++ b/test/furlex/parser/twitter_test.exs @@ -1,7 +1,7 @@ -defmodule Furlex.Parser.TwitterTest do +defmodule Unfurl.Parser.TwitterTest do use ExUnit.Case - alias Furlex.Parser.Twitter + alias Unfurl.Parser.Twitter doctest Twitter @@ -12,9 +12,7 @@ defmodule Furlex.Parser.TwitterTest do assert {:ok, %{ - "twitter" => %{ - "image" => "www.example.com" - } + "image" => "www.example.com" }} == Twitter.parse(html) end end diff --git a/test/furlex/parser_test.exs b/test/furlex/parser_test.exs index 709b4f8..ae13862 100644 --- a/test/furlex/parser_test.exs +++ b/test/furlex/parser_test.exs 
@@ -1,12 +1,12 @@ -defmodule Furlex.ParserTest do +defmodule Unfurl.ParserTest do use ExUnit.Case - alias Furlex.Parser + alias Unfurl.Parser doctest Parser setup do - Application.put_env(:furlex, :group_keys?, true) + Application.put_env(:unfurl, :group_keys?, true) end test "extracts tags from html" do @@ -37,7 +37,7 @@ defmodule Furlex.ParserTest do "twitter:card" => "player" } - result = Parser.group_keys(map) + result = Parser.maybe_group_keys(map) assert result == %{ "twitter" => %{ diff --git a/test/test_helper.exs b/test/test_helper.exs index 1a33fff..c238992 100644 --- a/test/test_helper.exs +++ b/test/test_helper.exs @@ -1,2 +1,3 @@ ExUnit.start() {:ok, _} = Application.ensure_all_started(:bypass) +{:ok, _} = Application.ensure_all_started(:hackney) diff --git a/test/furlex_test.exs b/test/unfurl_test.exs similarity index 57% rename from test/furlex_test.exs rename to test/unfurl_test.exs index a89c91a..5cc1dce 100644 --- a/test/furlex_test.exs +++ b/test/unfurl_test.exs @@ -1,18 +1,18 @@ -defmodule FurlexTest do +defmodule UnfurlTest do use ExUnit.Case setup do bypass = Bypass.open() url = "http://localhost:#{bypass.port}" - oembed = Furlex.Oembed - oembed_config = Application.get_env(:furlex, oembed, []) + oembed = Unfurl.Oembed + oembed_config = Application.get_env(:unfurl, oembed, []) new_config = Keyword.put(oembed_config, :oembed_host, url) - Application.put_env(:furlex, oembed, new_config) + Application.put_env(:unfurl, oembed, new_config) on_exit(fn -> - Application.put_env(:furlex, oembed, oembed_config) + Application.put_env(:unfurl, oembed, oembed_config) :ok end) @@ -20,15 +20,18 @@ defmodule FurlexTest do {:ok, bypass: bypass, url: url} end + @tag :fixme test "unfurls a url", %{bypass: bypass, url: url} do Bypass.expect(bypass, &handle/1) - assert {:ok, %Furlex{} = furlex} = Furlex.unfurl(url) + assert {:ok, %{} = unfurl} = + Unfurl.unfurl(url) + |> IO.inspect() - assert furlex.status_code == 200 - assert furlex.facebook["og"]["site_name"] == 
"Vimeo" - assert furlex.twitter["twitter"]["title"] == "FIDLAR - Cocaine (Feat. Nick Offerman)" - assert Enum.at(furlex.json_ld, 0)["@type"] == "VideoObject" + assert unfurl.status_code == 200 + assert unfurl.facebook["site_name"] == "Vimeo" + assert unfurl.twitter["title"] == "FIDLAR - Cocaine (Feat. Nick Offerman)" + assert Enum.at(unfurl.json_ld, 0)["@type"] == "VideoObject" end def handle(%{request_path: "/providers.json"} = conn) do