Create get_recent_victims.py #100

Open · wants to merge 5 commits into main
127 changes: 127 additions & 0 deletions Sample Scripts/get_recent_victims.py
# The following code grabs the data from the "Recent Victims" tab. For each victim, the
# code checks whether an associated domain is present; if it is, the domain is added to
# the running list, victim_git_final.csv. If the domain is not present, the code takes
# the corresponding company name, performs a Google search, and grabs the first result
# URL. It then extracts the domain from that URL and finally presents you with a
# deduplicated list of domains for all recent victims in victim_final.csv.

# There is also a function defined (commented out below) that calculates the previous
# month and year based on the "current date" when the script is executed; it can be
# used to get data for that specific month and year instead of all recent victims. To
# use it, switch to the commented-out victims/{year}/{prev_month} URL in main().

import requests
import tldextract
import json
import os
import datetime  # required if get_year_and_prev_month() below is enabled
import pandas as pd
import time
from requests.exceptions import HTTPError, ProxyError
from bs4 import BeautifulSoup

# Defines the order in which the data will be stored in the csv file
column_order = ['post_title','group_name','discovered','published','post_url','country','website']
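# A hypothetical output row, for illustration only:
#   post_title,group_name,discovered,published,post_url,country,website
#   "Example Corp","lockbit3","2024-01-02 03:04:05","2024-01-01","https://...","US","example.com"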

# Function to calculate the previous month and year based on the current date
'''def get_year_and_prev_month():
    today = datetime.date.today()
    first = today.replace(day=1)
    last_month = first - datetime.timedelta(days=1)
    year = int(last_month.strftime("%Y"))
    prev_month = int(last_month.strftime("%m"))
    return (year, prev_month)'''
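# Example (if enabled): run on 2024-03-15, get_year_and_prev_month() would return (2024, 2).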

# Checks if the name is in url format then extracts the domain from it
def get_domain(url):
    extracted = tldextract.extract(url)
    domain = extracted.domain + '.' + extracted.suffix
    return domain
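# Example: get_domain('https://www.example.co.uk/page') returns 'example.co.uk'.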

# Extracts the domain name from the url retrieved after performing a google search for the company name
def get_domain_post_title(url):
    if url == "N/A":
        return url
    else:
        url = url.split('?q=')[1]
        extracted = tldextract.extract(url)
        domain = extracted.domain + '.' + extracted.suffix
        return domain
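# Example: get_domain_post_title('/url?q=https://www.example.com/&sa=U') returns 'example.com';
# tldextract ignores the path and query parameters when extracting the registered domain.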

# Google search the company name to extract the associated domain name
def google_search(query: str) -> str:
    """
    Searches Google for the given query and returns the first result's URL.

    Args:
        query (str): The query to be searched

    Returns:
        first_link (str): The first result's URL, or "N/A" if nothing was found
    """
    try:
        url = f"https://www.google.com/search?q={'+'.join(query.split())}"
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        anchor_tags = soup.find_all('a')
        # Extract the result URLs from the anchor tags, skipping Google's own links
        urls = []
        for tag in anchor_tags:
            url = tag.get('href')
            if url and url.startswith('/url?q=https://'):
                if "google.com" in url:
                    pass
                else:
                    urls.append(url)

        if urls:
            first_link = urls[0]
        else:
            print("No results found")
            first_link = "N/A"
        return first_link

    except HTTPError as e:
        if e.response.status_code == 429:
            print("Too many requests, retrying in 120 seconds..")
            time.sleep(120)
            return google_search(query)
        else:
            print(f"HTTP Error: {e.response.status_code}")

    except ProxyError as pe:
        print(f"Proxy error: {pe}")

    except requests.exceptions.ReadTimeout:
        print("Timeout occurred")
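# Example: google_search('Example Corp') might return something like
# '/url?q=https://www.example.com/&sa=U&...', which get_domain_post_title() reduces
# to 'example.com'. Note that Google may serve a consent page or block unauthenticated
# scraping, so this lookup is best-effort.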

# Get the data from the API and format it into a csv file
def main():
    #year, prev_month = get_year_and_prev_month()
    url = 'https://api.ransomware.live/recentvictims'
    #url = f'https://api.ransomware.live/victims/{year}/{prev_month}'
    #url = 'https://data.ransomware.live/posts.json'
    response = requests.get(url)
    if response:
        json_data = json.loads(response.text)

        # If a record already has an associated domain name, use it; otherwise
        # perform a Google search on the company name
        for data in json_data:
            if 'website' in data and data['website']:
                data['website'] = get_domain(data['website'])
            else:
                search_result = google_search(data['post_title'])
                if search_result is not None:
                    data['website'] = get_domain_post_title(search_result)

            # Pause between records to avoid hammering Google
            time.sleep(5)

            # Standardize the column order and append the record to the csv file,
            # writing the header only if the file does not exist yet
            df = pd.DataFrame([data])
            df = df[column_order]
            write_header = not os.path.exists('victim_git_final.csv')
            df.to_csv('victim_git_final.csv', mode='a', encoding='utf-8', index=False, header=write_header)

        # Deduplicate results
        df = pd.read_csv('victim_git_final.csv')
        df_dedup = df.drop_duplicates(subset=['post_title','published'])
        df_dedup.to_csv('victim_final.csv', index=False)
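
# Note: victim_git_final.csv acts as an append-only log across runs, while
# victim_final.csv always holds the deduplicated snapshot keyed on
# (post_title, published).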


if __name__ == "__main__":
    main()