parser.ex 5.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149
  defmodule PodcastFeed.Provider.Archive.Parser do
    @moduledoc """
    Builds podcast feed data from an archive.org item.

    `by_identifier/1` downloads the item's metadata over HTTP (via `:hackney`) and
    hands the decoded JSON to `parse/3`, which returns a map with a `:podcast`
    entry (channel-level data) and an `:items` entry (one map per file whose
    format matches MP3).
    """

    alias PodcastFeed.Utility.Format

    # URL templates handed to `Format.compile/2` together with the values for the
    # `{...}` placeholders.
    @extra_metadata_url "https://archive.org/download/{identifier}/metadata.json"
    @archive_metadata_url "https://archive.org/metadata/{identifier}"
    @download_url "https://archive.org/download/{identifier}/{filename}"

    # Returned when the item's optional `metadata.json` cannot be fetched.
    @extra_metadata_defaults %{
      "link" => "",
      "image" => %{
        "url" => "",
        "title" => "",
        "link" => ""
      },
      "category" => "",
      "explicit" => ""
    }

    @doc """
    Fetches the metadata of the archive.org item `identifier` and parses it.

    Two HTTP requests are made: one for the optional `metadata.json` stored with
    the item (defaults are used when it is not answered with a 200) and one for
    the archive.org metadata endpoint. The latter must answer with a 200,
    otherwise the match fails and the call raises.

    Returns the same map as `parse/3`.
    """
    @spec by_identifier(String.t()) :: %{podcast: map(), items: [map()]}
    def by_identifier(identifier) do
      extra_metadata_json = fetch_extra_metadata(identifier)
      metadata_json = fetch_archive_metadata(identifier)
      parse(identifier, metadata_json, extra_metadata_json)
    end

    # Downloads and decodes the item's own `metadata.json`, falling back to
    # `@extra_metadata_defaults` when it is unavailable.
    defp fetch_extra_metadata(identifier) do
      @extra_metadata_url
      |> Format.compile(identifier: identifier)
      |> :hackney.get([], "", follow_redirect: true)
      |> parse_extra_metadata_response()
    end

    defp parse_extra_metadata_response({:ok, 200, _headers, client_ref}) do
      {:ok, body} = :hackney.body(client_ref)

      # Line breaks are removed before decoding, as the original implementation did
      # (presumably to tolerate raw newlines in hand-written files -- TODO confirm).
      body
      |> String.replace("\n", "")
      |> Poison.decode!()
    end

    defp parse_extra_metadata_response({:ok, _status, _headers, client_ref}) do
      # The body is of no interest, but it still has to be skipped (or read) so
      # that hackney can release the connection instead of leaking it.
      :hackney.skip_body(client_ref)
      @extra_metadata_defaults
    end

    defp parse_extra_metadata_response(_), do: @extra_metadata_defaults

    # Downloads and decodes the archive.org metadata of the item. Anything but a
    # 200 response fails the match and raises.
    defp fetch_archive_metadata(identifier) do
      metadata_url = Format.compile(@archive_metadata_url, identifier: identifier)
      options = [follow_redirect: true, connect_timeout: 30_000, recv_timeout: 30_000]

      {:ok, 200, _headers, client_ref} = :hackney.get(metadata_url, [], "", options)
      {:ok, metadata_json} = :hackney.body(client_ref)
      Poison.decode!(metadata_json)
    end

    @doc """
    Converts decoded archive.org metadata into podcast feed data.

    ## Parameters

      * `identifier` - the archive.org item identifier, used to build download URLs
      * `metadata` - decoded metadata; must contain the `"files"`, `"metadata"` and
        `"item_last_updated"` keys, otherwise no function clause matches
      * `extra` - decoded extra metadata (`"link"`, `"image"`, `"category"`,
        `"explicit"`); when it carries no image URL, the item's cover (the first
        original JPEG file) is used instead, if there is one

    ## Returns

    A map `%{podcast: map, items: list}` where `items` holds one map per file
    whose `"format"` matches MP3.
    """
    @spec parse(String.t(), map(), map()) :: %{podcast: map(), items: [map()]}
    def parse(identifier, metadata = %{"files" => files}, extra) do
      extra =
        files
        |> fetch_cover(identifier)
        |> enrich_extra_metadata_with_cover(extra)

      %{podcast: podcast_data(metadata, extra), items: items_data(metadata, identifier)}
    end

    # No cover was found: nothing to add.
    defp enrich_extra_metadata_with_cover(nil, extra), do: extra

    # A cover was found: use it unless the extra metadata already names an image.
    # A missing "image" entry, a missing "url" and an empty "url" all count as
    # "no image set".
    defp enrich_extra_metadata_with_cover(cover, extra) when is_map(extra) do
      case Map.get(extra, "image") || %{} do
        %{"url" => url} when url not in [nil, ""] -> extra
        %{} = image -> Map.put(extra, "image", Map.put(image, "url", cover))
        _not_a_map -> extra
      end
    end

    defp enrich_extra_metadata_with_cover(_cover, extra), do: extra

    # Builds the channel-level data of the feed.
    defp podcast_data(%{"metadata" => metadata, "item_last_updated" => last_updated}, extra) do
      published_at =
        metadata["publicdate"]
        |> NaiveDateTime.from_iso8601!()
        |> DateTime.from_naive!("Etc/UTC")

      %{
        title: metadata["title"],
        description: metadata["description"],
        webmaster: metadata["uploader"],
        managingEditor: metadata["uploader"],
        owner: %{
          name: metadata["creator"],
          email: metadata["uploader"]
        },
        keywords: metadata["subject"],
        pubDate: published_at,
        lastBuildDate: DateTime.from_unix!(last_updated, :second),
        author: metadata["creator"],
        language: metadata["language"],
        image: %{
          url: extra["image"]["url"],
          title: extra["image"]["title"],
          link: extra["image"]["link"]
        },
        link: extra["link"],
        category: extra["category"],
        explicit: extra["explicit"]
      }
    end

    # Builds one feed item per audio file of the item.
    defp items_data(%{"files" => files}, identifier) do
      files
      |> filter_audio_files()
      |> Enum.map(&to_feed_item(&1, identifier, files))
    end

    # FIXME: only MP3 is recognised; other audio formats (e.g. ogg) are dropped.
    defp filter_audio_files(files) do
      Enum.filter(files, fn
        %{"format" => format} when is_binary(format) -> format =~ ~r/MP3/i
        # Entries without a textual format cannot be audio files.
        _other -> false
      end)
    end

    # Builds the feed item of a single audio file. `files` is the complete file
    # list of the item and is searched for an image derived from the audio file.
    defp to_feed_item(file, identifier, files) do
      filename = file["name"]
      image_filename = fetch_image_of_audio(filename, files)

      # "mtime" and "length" arrive as strings; only the leading number is used.
      modified_at = file |> Map.get("mtime") |> Integer.parse() |> elem(0)
      duration = file |> Map.get("length") |> Float.parse() |> elem(0)

      %{
        title: file["title"],
        description: "",
        pubDate: DateTime.from_unix!(modified_at, :second),
        link: download_url(identifier, filename),
        # The fractional part is dropped.
        length: trunc(duration),
        size: file["size"],
        summary: "",
        image: download_url(identifier, image_filename),
        keywords: file |> Map.take(["album", "artist", "genre"]) |> Map.values(),
        explicit: "no"
      }
    end

    # Returns the download URL of the item's cover, or nil when there is none.
    # FIXME: only original JPEG files are considered; png and gif are ignored.
    defp fetch_cover(files, identifier) do
      filename =
        files
        |> Enum.find(fn file -> file["source"] == "original" and file["format"] == "JPEG" end)
        |> case do
          nil -> nil
          file -> Map.get(file, "name")
        end

      download_url(identifier, filename)
    end

    # Returns the name of the first image file derived from `audio_file`, or nil.
    defp fetch_image_of_audio(audio_file, files) do
      files
      |> Enum.find(fn
        %{"format" => format, "source" => "derivative", "original" => ^audio_file}
        when is_binary(format) ->
          format =~ ~r/JPG|JPEG|PNG|GIF/i

        _other ->
          false
      end)
      |> case do
        nil -> nil
        image_file -> Map.get(image_file, "name")
      end
    end

    # Builds the URI-encoded download URL of a file; nil when there is no file.
    defp download_url(_identifier, nil), do: nil

    defp download_url(identifier, filename) do
      @download_url
      |> Format.compile(identifier: identifier, filename: filename)
      |> URI.encode()
    end
  end