Skip to content

Commit

Permalink
update
Browse files Browse the repository at this point in the history
  • Loading branch information
Itsoon committed Mar 8, 2024
1 parent 3602fba commit 1723492
Show file tree
Hide file tree
Showing 7 changed files with 50 additions and 66 deletions.
3 changes: 1 addition & 2 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1 @@
/extracted/*
!/extracted/DO NOT DELETE THIS FOLDER
extracted/*
6 changes: 0 additions & 6 deletions .vscode/settings.json

This file was deleted.

8 changes: 6 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# do not use
# Do not use (it works, but it is slow, unoptimized, and the code is ugly)
# Favicon extractor

## Compatibility :
Expand Down Expand Up @@ -71,7 +71,11 @@ pip install -r requirements.txt

## To do

- [ ] check_url more tests with different urls
- [ ] why selenium
- [ ] log
- [ ] argparse
- [ ] threading
- [ ] fix check_url

add this to url search :

Expand Down
Empty file.
95 changes: 41 additions & 54 deletions favicon_extractor_gecko.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,20 +11,20 @@
from selenium.webdriver.support import expected_conditions as EC
import threading


print_error = False


options = Options()
options.add_argument("--headless")
driver = webdriver.Firefox(options=options)
# driver = webdriver.Firefox()

headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
}


def write_logs(url, e):
    """Append one ``<url> ==> <error>`` line to ``log.txt``.

    The file is opened relative to the current working directory and is
    created if it does not exist yet.

    Args:
        url: The URL (or free-form context string) the entry refers to.
        e: The exception or message to record; it is interpolated with an
            f-string, so any object with a string form is accepted.
    """
    # Pin the encoding: with the platform default (e.g. cp1252 on Windows) a
    # URL or error message containing non-ASCII characters raises
    # UnicodeEncodeError and the log entry is lost.
    with open("log.txt", "a", encoding="utf-8") as f:
        f.write(f"{url} ==> {e}\n")


def check_url(url):
try:
response = requests.head(url, headers=headers)
Expand All @@ -41,18 +41,16 @@ def check_url(url):
return True

except Exception as e:
write_logs(url, e)
print(
f"{colored(' ==>', 'light_red')} CheckUrl error = {url} code = {colored(e, 'light_red', attrs=['underline'])}")
f"{colored(' ==>', 'light_red')} CheckUrl error = {url} code = {colored(e, 'light_red', attrs=['underline'])}"
)
return False


def download(url, name, counter):
try:
try:
start_time = time.time()
icon_response = requests.get(url, headers=headers, timeout=4)
elapsed_time = time.time() - start_time
except:
icon_response = requests.get(url)

Expand All @@ -65,22 +63,13 @@ def download(url, name, counter):
path = parsed_url.path
file_extension = os.path.splitext(path)[1]

with open('extracted/' + name_ + file_extension, 'wb') as file:
with open("extracted/" + name_ + file_extension, "wb") as file:
file.write(icon_response.content)
print(f"{colored(' ==>', 'green')} Successful download {url}")
except Exception as e:
write_logs("Failed download error for URL for " + url, e)
print(
f"{colored(' ==>', 'light_red')} Failed download error for URL = {url} code = {colored(e, 'light_red', attrs=['underline'])}")


def download_threaded(name, url, counter):
    """Run ``download`` for one icon and report any error that escapes it.

    Used as a ``threading.Thread`` target. Note that the argument order here
    (``name`` first) differs from ``download`` (``url`` first). Any exception
    is passed to ``write_logs`` and printed; it is not re-raised.

    Args:
        name: Site name forwarded to ``download``.
        url: Icon URL to fetch.
        counter: Icon counter forwarded to ``download``.
    """
    try:
        download(url, name, counter)
    except Exception as exc:
        write_logs(url, exc)
        marker = colored(' ==>', 'light_red')
        detail = colored(exc, 'light_red', attrs=['underline'])
        print(f"{marker} Error in thread for URL = {url} code = {detail}")
f"{colored(' ==>', 'light_red')} Failed download error for URL = {url} code = {colored(e, 'light_red', attrs=['underline'])}"
)


print(
Expand All @@ -92,88 +81,86 @@ def download_threaded(name, url, counter):
██║ ██║ ██║ ╚████╔╝ ██║ ╚██████╗ ╚██████╔╝ ██║ ╚████║
╚═╝ ╚═╝ ╚═╝ ╚═══╝ ╚═╝ ╚═════╝ ╚═════╝ ╚═╝ ╚═══╝
""", "01000110 01100001 01110110 01101001 01100011 01101111 01101110 ")
print("\n")
""",
"01000110 01100001 01110110 01101001 01100011 01101111 01101110 ", "\n"
)

with open('url.json') as f:
with open("url.json") as f:
data = json.load(f)

threads = []

for name, url in data["sites"].items():
try:
# print(
# f"{colored('::', 'light_blue', attrs=['bold'])} Start with {url}")

if check_url(url):

def getUrl(url):
driver.get(url)
try:
favicon = driver.find_element(
By.CSS_SELECTOR, 'link[rel="icon"]')
if favicon.get_attribute('href') in ['data:,', '']:
favicon = driver.find_element(By.CSS_SELECTOR, 'link[rel="icon"]')
if favicon.get_attribute("href") in ["data:,", ""]:

favicon = WebDriverWait(driver, 30).until(
EC.presence_of_element_located(
(By.CSS_SELECTOR, 'link[rel="icon"]'))
(By.CSS_SELECTOR, 'link[rel="icon"]')
)
)

WebDriverWait(driver, 30).until(
lambda driver: favicon.get_attribute('href') not in [
'data:,', '']
lambda driver: favicon.get_attribute("href")
not in ["data:,", ""]
)
except Exception as e:
if print_error:
print(e)
pass
tags = driver.find_elements(
By.CSS_SELECTOR, "link[rel*='icon']")
tags = driver.find_elements(By.CSS_SELECTOR, "link[rel*='icon']")
return tags

link_tags = getUrl(url)

if link_tags == []:
# print(colored(" ==> retry with url search", "red"))
if check_url(url + "/favicon.ico"):
counter = 0
download(url + "/favicon.ico", name, counter)
continue
else:
write_logs(
url, "the site does not contain a favicon, or there is a problem in the script")
print(
f"{colored(' ==>', 'light_red')} Error for URL = {url} code = {colored('the site does not contain a favicon, or there is a problem in the script', 'light_red', attrs=['underline'])}")

icon_types = [['icon'], ['shortcut', 'icon'], [
'apple-touch-icon', 'apple-touch-icon-precomposed'], ['fluid-icon', 'mask-icon'], ['msapplication-TileImage']]
f"{colored(' ==>', 'light_red')} Error for URL = {url} code = {colored('the site does not contain a favicon, or there is a problem in the script', 'light_red', attrs=['underline'])}"
)

icon_types = [
["icon"],
["shortcut", "icon"],
["apple-touch-icon", "apple-touch-icon-precomposed"],
["fluid-icon", "mask-icon"],
["msapplication-TileImage"],
]
counter = 0

for link_tag in link_tags:
rels = link_tag.get_attribute('rel').split()
rels = link_tag.get_attribute("rel").split()
rels = [rel.lower() for rel in rels]
for icon_type in icon_types:
if all(rel in rels for rel in icon_type):
icon_url = link_tag.get_attribute('href')
thread = threading.Thread(
target=download_threaded, args=(name, icon_url, counter))
threads.append(thread)
thread.start()
icon_url = link_tag.get_attribute("href")
download(icon_url, name, counter)
counter += 1
break

else:
write_logs(url, "invalid url")
print(
f"{colored(' ==>', 'light_red')} Error for URL = {url} code = {colored('invalid url', 'light_red', attrs=['underline'])}")
f"{colored(' ==>', 'light_red')} Error for URL = {url} code = {colored('invalid url', 'light_red', attrs=['underline'])}"
)

except Exception as e:
write_logs(url, e)
print(
f"{colored(' ==>', 'light_red')} Error for URL = {url} code = {colored(e, 'light_red', attrs=['underline'])}")
f"{colored(' ==>', 'light_red')} Error for URL = {url} code = {colored(e, 'light_red', attrs=['underline'])}"
)

for thread in threads:
thread.join()

print(colored('Successful download', 'green'),
' look at log.txt to see errors')
print(colored("Successful download", "green"))

driver.quit()
Empty file removed log.txt
Empty file.
4 changes: 2 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
python-time
requests
termcolor
urllib3
selenium
colorama
selenium

0 comments on commit 1723492

Please sign in to comment.