Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
121 changes: 94 additions & 27 deletions lib/sanbase/external_services/etherscan/scraper.ex
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,6 @@ defmodule Sanbase.ExternalServices.Etherscan.Scraper do
| total_supply: total_supply(html) || project_info.total_supply,
main_contract_address: project_info.main_contract_address || main_contract_address(html),
token_decimals: project_info.token_decimals || token_decimals(html),
website_link: project_info.website_link || website_link(html),
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is removed because I couldn't find a reliable way to extract the website URL from the HTML.

email: project_info.email || official_link(html, "Email") |> email(),
reddit_link: project_info.reddit_link || official_link(html, "Reddit"),
twitter_link: project_info.twitter_link || official_link(html, "Twitter"),
Expand All @@ -89,16 +88,67 @@ defmodule Sanbase.ExternalServices.Etherscan.Scraper do
}
end

defp website_link(html) do
Floki.find(html, ~s/#ContentPlaceHolder1_tr_officialsite_1 > div > div.col-md-8 > a/)
|> Floki.attribute("href")
|> List.first()
end

defp official_link(html, media) do
Floki.find(html, ~s/a[data-original-title^="#{media}:"]/)
|> Floki.attribute("href")
|> List.first()
case media do
"Reddit" ->
Floki.find(html, "a[href*='reddit.com']")
|> Floki.attribute("href")
|> List.first()

"Twitter" ->
Floki.find(html, "a[href*='twitter.com'], a[href*='x.com']")
|> Floki.attribute("href")
|> List.first()

"Bitcointalk" ->
Floki.find(html, "a[href*='bitcointalk.org']")
|> Floki.attribute("href")
|> List.first()

"Blog" ->
Floki.find(html, "a[href*='blog'], a[href*='medium.com'], a[href*='substack.com']")
|> Enum.find(fn link ->
href = Floki.attribute(link, "href") |> List.first()
href && !String.contains?(href, "etherscan-blog")
end)
|> case do
nil -> nil
link -> Floki.attribute(link, "href") |> List.first()
end
Copy link

Copilot AI Aug 21, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Floki.find/2 returns a list of HTML nodes, and the Enum.find/2 callback then calls Floki.attribute/2 on each individual node to read its 'href'; the matching node's 'href' is extracted a second time in the following case expression. Consider calling Floki.attribute/2 on the entire list first and then filtering the resulting URLs, which avoids relying on per-node attribute extraction and removes the duplicate lookup.

Suggested change
end
|> Floki.attribute("href")
|> Enum.find(fn href ->
href && !String.contains?(href, "etherscan-blog")
end)

Copilot uses AI. Check for mistakes.

"Github" ->
Floki.find(html, "a[href*='github.com']")
|> Floki.attribute("href")
|> List.first()

"Telegram" ->
Floki.find(html, "a[href*='t.me'], a[href*='telegram.me']")
|> Floki.attribute("href")
|> List.first()

"Slack" ->
Floki.find(html, "a[href*='slack.com'], a[href*='slack.']")
|> Floki.attribute("href")
|> List.first()

"Facebook" ->
Floki.find(html, "a[href*='facebook.com']")
|> Floki.attribute("href")
|> List.first()

"Whitepaper" ->
Floki.find(html, "a[href*='whitepaper'], a[href*='.pdf']")
|> Floki.attribute("href")
|> List.first()

"Email" ->
Floki.find(html, "a[href^='mailto:']")
|> Floki.attribute("href")
|> List.first()

_ ->
nil
end
end

defp email("mailto:" <> email), do: email
Expand All @@ -117,38 +167,56 @@ defmodule Sanbase.ExternalServices.Etherscan.Scraper do
end

defp total_supply(html) do
# TODO: 21.05.2018 Lyudmil Lesinksi
# The real css selector shoul be "#ContentPlaceHolder1_divSummary > div:first-child tr:first-child > td + td"
# but for some reason Floki doesn't recognize that as the valid selector so we have to use Enum.at
Floki.find(html, ~s/#ContentPlaceHolder1_divSummary > div:first-child div > div + div/)
|> Enum.at(0)
# Look for the total supply in the hidden input field
Floki.find(html, "input[id*='TotalSupply']")
|> Floki.attribute("value")
|> List.first()
|> case do
nil ->
nil

match ->
Floki.text(match)
value ->
value
|> parse_total_supply()
|> Decimal.round()
|> Decimal.to_integer()
end
end

defp main_contract_address(html) do
Floki.find(html, ~s/div:fl-contains('Contract') + div/)
Floki.find(html, "i[aria-label=\"Contract\"] + a[href*='/address/']")
|> List.first()
|> case do
nil ->
Floki.find(html, "h4:contains('Token Contract') + div a[href*='/address/']")
|> List.first()

match ->
match
end
|> case do
nil -> nil
match -> Floki.text(match)
match -> Floki.text(match) |> String.trim()
end
end

defp token_decimals(html) do
Floki.find(html, ~s/div:fl-contains('Decimals') + div/)
|> List.first()
html
|> Floki.find("h4")
|> Enum.find(fn h4 ->
Floki.text(h4) |> String.contains?("Token Contract")
end)
|> case do
nil -> nil
match -> Floki.text(match) |> parse_token_decimals()
nil ->
nil

h4 ->
Floki.find(h4, "b")
Copy link

Copilot AI Aug 21, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Using Floki.find/2 on an h4 element to find nested 'b' tags may not work as expected. The h4 variable contains a single HTML element, but Floki.find/2 typically expects an HTML document or fragment. Consider using a different approach to extract the bold text from within the h4 element.

Suggested change
Floki.find(h4, "b")
Floki.find([h4], "b")

Copilot uses AI. Check for mistakes.
|> List.first()
|> case do
nil -> nil
b -> Floki.text(b) |> parse_token_decimals()
end
end
end

Expand All @@ -162,13 +230,12 @@ defmodule Sanbase.ExternalServices.Etherscan.Scraper do

defp parse_total_supply(""), do: nil

defp parse_total_supply(total_supply) do
defp parse_total_supply(total_supply) when is_binary(total_supply) do
total_supply
|> String.trim()
|> String.replace(",", "")
|> String.split()
|> Enum.find(fn x -> String.starts_with?(x, "Supply") end)
|> (fn supply -> String.trim(supply, "Supply:") end).()
|> Decimal.new()
Copy link

Copilot AI Aug 21, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This change removes the logic that previously located and stripped the 'Supply:' prefix from the total supply string, and adds an is_binary guard. Parsing will fail in Decimal.new/1 if the input still contains a text prefix (or any other non-numeric text) that needs to be stripped before conversion to Decimal.

Copilot uses AI. Check for mistakes.
end

defp parse_total_supply(_), do: nil
end
1 change: 0 additions & 1 deletion test/sanbase/external_services/etherscan/scraper_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@ defmodule Sanbase.ExternalServices.Etherscan.ScraperTest do
total_supply: 6_804_870_175,
main_contract_address: "0x744d70fdbe2ba4cf95131626614a1763df805b9e",
token_decimals: 18,
website_link: "https://status.im/",
email: nil,
reddit_link: "https://www.reddit.com/r/statusim/",
twitter_link: "https://twitter.com/ethstatus",
Expand Down
Loading