open-pod/lib/podcast_feed/provider/archive/parser.ex

160 lines
5.5 KiB
Elixir
Raw Normal View History

2020-05-22 22:04:15 +02:00
defmodule PodcastFeed.Provider.Archive.Parser do
@moduledoc """
This module provides a public API for fetching data from archive.org and converting it
into a common podcast data structure.
"""
alias PodcastFeed.Utility.Format
2020-05-26 20:53:59 +02:00
alias __MODULE__
2020-05-26 20:53:59 +02:00
# URL templates. The `{identifier}` / `{filename}` placeholders are filled in
# with `Format.compile/2` by the functions below.

# Custom feed metadata file stored among the item's own files.
@custom_metadata_url "https://archive.org/download/{identifier}/metadata.json"
# Item metadata endpoint. Uses HTTPS like every other archive.org URL in this
# module, instead of relying on a plain-HTTP request being redirected.
@archive_metadata_url "https://archive.org/metadata/{identifier}"
# Direct download link for a single file of the item.
@download_url "https://archive.org/download/{identifier}/{filename}"
# Human-facing details page, used as the default podcast/image link.
@podcast_link "https://archive.org/details/{identifier}"
2020-05-26 20:53:59 +02:00
# Custom metadata used as the struct default and whenever the item's
# `metadata.json` cannot be retrieved.
@custom_metadata_defaults %{
  "link" => nil,
  "image" => %{"url" => nil, "title" => nil, "link" => nil},
  "category" => "",
  "explicit" => "no",
  "version" => "1"
}
2020-05-26 20:53:59 +02:00
# Token threaded through the parsing pipeline; only `:identifier` is required
# up front, the metadata fields are filled in by the `enrich_with_*` steps.
@enforce_keys [:identifier]
defstruct identifier: nil,
          podcast_data: nil,
          archive_metadata: nil,
          custom_metadata: @custom_metadata_defaults
2020-05-25 21:39:04 +02:00
2020-05-26 20:53:59 +02:00
@doc """
Builds the podcast feed data for the archive.org item named by `identifier`.

Fetches the item's archive metadata and its custom metadata, then converts
both into the map returned by `to_podcast_feed_data/1`.
"""
def by_identifier(identifier) do
  token = %Parser{identifier: identifier}
  with_archive = enrich_with_archive_metadata(token)
  with_custom = enrich_with_custom_metadata(with_archive)

  to_podcast_feed_data(with_custom)
end
2020-05-26 20:53:59 +02:00
@doc """
Converts an enriched parser token into a map with two keys:

  * `:podcast` - channel-level data
  * `:items` - one entry per audio file of the item
"""
def to_podcast_feed_data(token) do
  podcast = podcast_data(token)
  items = items_data(token)

  %{podcast: podcast, items: items}
end
2020-05-26 20:53:59 +02:00
# Builds the channel-level podcast map from the archive metadata, letting
# values from the custom metadata override the image, link, category and
# explicit fields.
defp podcast_data(
       %{
         custom_metadata: custom,
         archive_metadata: %{"metadata" => metadata, "item_last_updated" => last_updated}
       } = token
     ) do
  details_link = Format.compile(@podcast_link, identifier: token.identifier)
  title = metadata["title"]
  creator = metadata["creator"]
  uploader = metadata["uploader"]

  published_at =
    metadata["publicdate"]
    |> NaiveDateTime.from_iso8601!()
    |> DateTime.from_naive!("Etc/UTC")

  # Each image field falls back to a value derived from the item itself when
  # the custom metadata does not provide one.
  image = %{
    url: get_in(custom, ["image", "url"]) || fetch_cover(token),
    title: get_in(custom, ["image", "title"]) || title,
    link: get_in(custom, ["image", "link"]) || details_link
  }

  %{
    title: title,
    description: metadata["description"],
    webmaster: uploader,
    managingEditor: uploader,
    owner: %{name: creator, email: uploader},
    keywords: metadata["subject"],
    pubDate: published_at,
    lastBuildDate: DateTime.from_unix!(last_updated, :second),
    author: creator,
    language: metadata["language"],
    image: image,
    link: Map.get(custom, "link") || details_link,
    category: Map.get(custom, "category", ""),
    explicit: Map.get(custom, "explicit", "no")
  }
end
2020-05-26 20:53:59 +02:00
# Produces one feed item per audio file listed in the archive metadata.
defp items_data(%{identifier: identifier, archive_metadata: %{"files" => files}}) do
  for audio_file <- filter_audio_files(files) do
    to_feed_item(audio_file, identifier, files)
  end
end
2020-05-26 20:53:59 +02:00
# Requests the item's custom `metadata.json` (following redirects) and hands
# the raw hackney response to `parse_custom_metadata_response/1`.
defp fetch_custom_metadata(identifier) do
  response =
    @custom_metadata_url
    |> Format.compile(identifier: identifier)
    |> :hackney.get([], "", follow_redirect: true)

  parse_custom_metadata_response(response)
end
# Turns the hackney response for the custom `metadata.json` into a map.
#
# The custom metadata is optional, so every failure mode (request error,
# non-200 status, unreadable body, malformed or non-object JSON) yields
# `@custom_metadata_defaults` instead of aborting the whole feed.
defp parse_custom_metadata_response({:ok, 200, _headers, client_ref}) do
  with {:ok, body} <- :hackney.body(client_ref),
       # Newlines are stripped before decoding, as the file is not guaranteed
       # to be a single-line document.
       {:ok, %{} = custom_metadata} <- body |> String.replace("\n", "") |> Jason.decode() do
    custom_metadata
  else
    _ -> @custom_metadata_defaults
  end
end

defp parse_custom_metadata_response({:ok, _status, _headers, client_ref}) do
  # The body of a non-200 answer is of no interest, but it must still be
  # skipped so that hackney can release the connection.
  :hackney.skip_body(client_ref)
  @custom_metadata_defaults
end

defp parse_custom_metadata_response(_), do: @custom_metadata_defaults
# Downloads and decodes the item's archive metadata. Anything other than a
# successful 200 response with a readable JSON body raises.
defp fetch_archive_metadata(identifier) do
  url = Format.compile(@archive_metadata_url, identifier: identifier)
  request_opts = [follow_redirect: true, connect_timeout: 30_000, recv_timeout: 30_000]

  {:ok, 200, _headers, client_ref} = :hackney.get(url, [], "", request_opts)
  {:ok, raw_json} = :hackney.body(client_ref)

  Jason.decode!(raw_json)
end
# Keeps only the file entries whose "format" mentions MP3 (case-insensitive).
# Entries without a string "format" are dropped instead of raising on `=~`.
# FIXME: only MP3 is recognised; other audio formats (e.g. Ogg) are ignored.
defp filter_audio_files(files) do
  Enum.filter(files, fn
    %{"format" => format} when is_binary(format) -> format =~ ~r/MP3/i
    _other -> false
  end)
end
2020-05-22 22:04:15 +02:00
2020-05-26 20:53:59 +02:00
# Builds a single feed item from an archive.org file entry.
#
# `file` is the decoded file map, `identifier` the item it belongs to. The
# third argument (the full file list) is currently unused; it is kept for the
# disabled per-episode image lookup below.
defp to_feed_item(file, identifier, _files) do
  filename = Map.get(file, "name")

  %{
    title: file["title"],
    description: "",
    pubDate:
      file |> Map.get("mtime") |> Integer.parse() |> elem(0) |> DateTime.from_unix!(:second),
    link: download_url(identifier, filename),
    length: file |> Map.get("length") |> duration_in_seconds(),
    size: Map.get(file, "size"),
    summary: "",
    # image: download_url(identifier, fetch_image_of_audio(filename, files)),
    image: nil,
    keywords: file |> Map.take(["album", "artist", "genre"]) |> Map.values(),
    explicit: "no"
  }
end

# Converts a file's "length" value into whole seconds.
#
# Accepts plain seconds ("205.32") as well as clock-style values ("3:25",
# "1:02:33"); the latter would otherwise be cut off at the first colon.
# A missing or unparsable value yields 0 instead of raising.
defp duration_in_seconds(length) when is_number(length), do: trunc(length)

defp duration_in_seconds(length) when is_binary(length) do
  length
  |> String.split(":")
  |> Enum.reduce_while(0, fn part, total ->
    case Float.parse(part) do
      # Every further colon-separated part is one unit (x60) smaller.
      {value, _rest} -> {:cont, total * 60 + value}
      :error -> {:halt, :error}
    end
  end)
  |> case do
    :error -> 0
    seconds -> trunc(seconds)
  end
end

defp duration_in_seconds(_missing), do: 0
2020-05-24 23:37:18 +02:00
2020-05-26 20:53:59 +02:00
# Returns the download URL of the first original (non-derivative) image file
# of the item, or `nil` when there is none.
#
# Entries without a string "format" are skipped rather than raising on `=~`,
# and the search stops at the first match instead of filtering the list twice.
defp fetch_cover(%{identifier: identifier, archive_metadata: %{"files" => files}}) do
  cover =
    Enum.find(files, fn
      %{"source" => "original", "format" => format} when is_binary(format) ->
        format =~ ~r/JPG|JPEG|PNG|GIF/i

      _other ->
        false
    end)

  # `download_url/2` maps a `nil` filename to `nil`.
  download_url(identifier, cover && Map.get(cover, "name"))
end
2020-05-26 20:53:59 +02:00
# defp fetch_image_of_audio(audio_file, files) do
# files
# |> Enum.filter(fn
# %{"format" => format, "source" => "derivative", "original" => ^audio_file} ->
# format =~ ~r/JPG|JPEG|PNG|GIF/i
# _ -> nil
# end)
# |> fetch_image_of_audio()
# end
2020-05-26 20:53:59 +02:00
# defp fetch_image_of_audio(image_files) when is_list(image_files), do: fetch_image_of_audio(List.first(image_files))
# defp fetch_image_of_audio(nil), do: nil
# defp fetch_image_of_audio(image_file), do: image_file |> Map.get("name", nil)
# Builds the URI-encoded download URL for `filename` within the item
# `identifier`. Returns `nil` when there is no filename.
defp download_url(identifier, filename) do
  case filename do
    nil ->
      nil

    _present ->
      @download_url
      |> Format.compile(identifier: identifier, filename: filename)
      |> URI.encode()
  end
end
2020-05-26 20:53:59 +02:00
# Stores the item's archive metadata on the token.
defp enrich_with_archive_metadata(token) do
  archive_metadata = fetch_archive_metadata(token.identifier)

  %Parser{token | archive_metadata: archive_metadata}
end
# Stores the item's custom metadata (or the defaults) on the token.
defp enrich_with_custom_metadata(token) do
  custom_metadata = fetch_custom_metadata(token.identifier)

  %Parser{token | custom_metadata: custom_metadata}
end
2020-05-22 22:04:15 +02:00
end