Adding Machine Learning Model #174

Open · wants to merge 1 commit into master
14 changes: 14 additions & 0 deletions exam.py
@@ -0,0 +1,14 @@
from collections import Counter

def fun(num, ids, rem):
    # Count how often each id occurs.
    counts = Counter(ids)
    # Rebuild the id list ordered by ascending frequency, so slicing off
    # the first `rem` items removes the rarest ids first.
    temp = []
    for key, count in sorted(counts.items(), key=lambda item: item[1]):
        temp += [key] * count
    # Number of distinct ids that survive the removal.
    return len(set(temp[rem:]))

print(fun(6, [1, 1, 1, 2, 3, 2], 2))
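
For reviewers, `fun` answers: how many distinct ids remain after removing the `rem` rarest occurrences? A quick sanity check (the second call is a hypothetical extra case, not from the diff):

# temp ordered by ascending frequency: [3, 2, 2, 1, 1, 1]
assert fun(6, [1, 1, 1, 2, 3, 2], 2) == 2  # drops 3 and one 2 -> {2, 1}
assert fun(6, [1, 1, 1, 2, 3, 2], 4) == 1  # drops 3 and both 2s -> {1}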
228 changes: 228 additions & 0 deletions features_extraction.py
@@ -0,0 +1,228 @@
from bs4 import BeautifulSoup
import urllib.request
import bs4
import re
import socket
import whois
from datetime import datetime
import time
from googlesearch import search
import sys
from patterns import *


def having_ip_address(url):
    # Dotted-decimal, hexadecimal, and plain \d{1,3} IPv4 forms.
    patt = r"(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[0-9]?[0-9])(\.|$){4}"
    patt2 = r"(0x([0-9][0-9]|[A-F][A-F]|[A-F][0-9]|[0-9][A-F]))(\.|$){4}"
    ip = r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}"

    match = re.search(ip, url) or re.search(patt, url) or re.search(patt2, url)
    return 1 if match else -1

def url_length(url):
    # Short URLs read as legitimate, mid-length as suspicious, long as phishing.
    if len(url) < 54:
        return 1
    if 54 <= len(url) <= 75:
        return 0
    return -1

def tiny_url(url):
    # Drop a leading "www." before measuring; very short URLs suggest a
    # link-shortening service.
    url = url.replace("www.", '')
    if len(url) < 7:
        return 1
    return -1

def having_at_symbol(url):
    match = re.search('@', url)
    return 1 if match else -1

def double_slash_redirecting(url):
    # A "//" appearing after the scheme (position > 7) suggests a redirect.
    last_double_slash = url.rfind('//')
    return 1 if (last_double_slash > 7 or last_double_slash == -1) else -1

def prefix_suffix(domain):
    match = re.search('-', domain)
    return 1 if match else -1

def having_sub_domain(url):
    # Strip a leading IP literal (IPv4 dotted-decimal or IPv6) so its
    # separators are not counted as subdomain dots.
    if having_ip_address(url) == -1:
        match = re.search(
            r'(([01]?\d\d?|2[0-4]\d|25[0-5])\.([01]?\d\d?|2[0-4]\d|25[0-5])\.([01]?\d\d?|2[0-4]\d|25[0-5])\.'
            r'([01]?\d\d?|2[0-4]\d|25[0-5]))|(?:[a-fA-F0-9]{1,4}:){7}[a-fA-F0-9]{1,4}',
            url)
        if match:
            url = url[match.end():]
    # The dot count decides the verdict, and every URL now gets a return value.
    num_dots = [x.start() for x in re.finditer(r'\.', url)]
    if len(num_dots) <= 3:
        return 1
    elif len(num_dots) == 4:
        return 0
    return -1

def favicon(wiki, soup, domain):
    # Check the first <link href=...> inside <head>: a favicon referenced
    # from the page's own domain counts as legitimate.
    for head in soup.find_all('head'):
        for link in head.find_all('link', href=True):
            dots = [x.start() for x in re.finditer(r'\.', link['href'])]
            return 1 if wiki in link['href'] or len(dots) == 1 or domain in link['href'] else -1
    return 1

def https_token(url):
    http_https = "^(http|https)://.*"
    match = re.search(http_https, url)
    if match and match.start() == 0:
        url = url[match.end():]
    match = re.search('http|https', url)
    return -1 if match else 1

def request_url(wiki, soup, domain):
    i = 0
    success = 0
    for img in soup.find_all('img', src=True):
        dots = [x.start() for x in re.finditer(r'\.', img['src'])]
        if wiki in img['src'] or domain in img['src'] or len(dots) == 1:
            success = success + 1
        i = i + 1
    for audio in soup.find_all('audio', src=True):
        dots = [x.start() for x in re.finditer(r'\.', audio['src'])]
        if wiki in audio['src'] or domain in audio['src'] or len(dots) == 1:
            success = success + 1
        i = i + 1
    for embed in soup.find_all('embed', src=True):
        dots = [x.start() for x in re.finditer(r'\.', embed['src'])]
        if wiki in embed['src'] or domain in embed['src'] or len(dots) == 1:
            success = success + 1
        i = i + 1
    # The HTML tag name is "iframe".
    for iframe in soup.find_all('iframe', src=True):
        dots = [x.start() for x in re.finditer(r'\.', iframe['src'])]
        if wiki in iframe['src'] or domain in iframe['src'] or len(dots) == 1:
            success = success + 1
        i = i + 1
    try:
        percentage = success / float(i) * 100
    except ZeroDivisionError:
        # No external resources on the page.
        return 1

    if percentage < 22.0:
        return 1
    elif 22.0 <= percentage < 61.0:
        return 0
    else:
        return -1

def url_of_anchor(wiki, soup, domain):
    i = 0
    unsafe = 0
    for a in soup.find_all('a', href=True):
        if "#" in a['href'] or "javascript" in a['href'].lower() or "mailto" in a['href'].lower() or not (
                wiki in a['href'] or domain in a['href']):
            unsafe = unsafe + 1
        i = i + 1
    try:
        percentage = unsafe / float(i) * 100
    except ZeroDivisionError:
        # Page has no anchors at all.
        return 1
    if percentage < 31.0:
        return 1
    elif 31.0 <= percentage < 67.0:
        return 0
    else:
        return -1

def links_in_tags(wiki, soup, domain):
    i = 0
    success = 0
    for link in soup.find_all('link', href=True):
        dots = [x.start() for x in re.finditer(r'\.', link['href'])]
        if wiki in link['href'] or domain in link['href'] or len(dots) == 1:
            success = success + 1
        i = i + 1

    for script in soup.find_all('script', src=True):
        dots = [x.start() for x in re.finditer(r'\.', script['src'])]
        if wiki in script['src'] or domain in script['src'] or len(dots) == 1:
            success = success + 1
        i = i + 1
    try:
        percentage = success / float(i) * 100
    except ZeroDivisionError:
        # No <link> or <script> resources on the page.
        return 1

    if percentage < 17.0:
        return 1
    elif 17.0 <= percentage < 65.0:
        return 0
    else:
        return -1

def sfh(wiki, soup, domain):
    for form in soup.find_all('form', action=True):
        if form['action'] == "" or form['action'] == "about:blank":
            return -1
        elif wiki not in form['action'] and domain not in form['action']:
            return 0
        else:
            return 1
    return 1

def submitting_to_email(soup):
    for form in soup.find_all('form', action=True):
        return 1 if "mailto:" in form['action'] else -1
    return -1

def i_frame(soup):
    # BeautifulSoup's HTML parser lowercases attribute names, so the
    # frameBorder attribute is matched as "frameborder".
    for iframe in soup.find_all('iframe', width=True, height=True, frameborder=True):
        if iframe['width'] == "0" and iframe['height'] == "0" and iframe['frameborder'] == "0":
            return 0
        if iframe['width'] == "0" or iframe['height'] == "0" or iframe['frameborder'] == "0":
            return -1
    return 1

def port(url):
    pattern = r"https?:\/\/(?:w{1,3}\.)?[^\s.]+(?:\.[a-z]+)*(?::\d+)?(?![^<]*(?:<\/\w+>|\/?>))"
    match = re.search(pattern, url)
    return 1 if match else -1

def https_in_url_domain(url):
    pattern = r"^(?:http:\/\/|www\.|https:\/\/)([^\/]+)"
    match = re.search(pattern, url)
    return 1 if match else -1

def get_hostname_from_url(url):
    # Strip the scheme and an optional leading "www.", then cut at the
    # first "/" so only the hostname remains.
    hostname = re.sub(r"^(?:https?://)?(?:www\.)?", "", url)
    slash = hostname.find("/")
    if slash != -1:
        hostname = hostname[:slash]
    return hostname

def main(url):
    # The DOM-based features need the page markup, not the URL string,
    # so fetch the document before parsing it.
    try:
        page = urllib.request.urlopen(url, timeout=10).read()
    except Exception:
        page = ""
    soup = BeautifulSoup(page, 'html.parser')
    status = []
    hostname = get_hostname_from_url(url)
    status.append(having_ip_address(url))
    status.append(url_length(url))
    status.append(tiny_url(url))
    status.append(having_at_symbol(url))
    status.append(double_slash_redirecting(url))
    status.append(prefix_suffix(hostname))
    status.append(having_sub_domain(url))
    status.append(https_token(url))
    status.append(favicon(url, soup, hostname))
    status.append(port(url))
    status.append(https_in_url_domain(url))
    status.append(request_url(url, soup, hostname))
    status.append(url_of_anchor(url, soup, hostname))
    status.append(links_in_tags(url, soup, hostname))
    status.append(sfh(url, soup, hostname))
    status.append(submitting_to_email(soup))
    status.append(i_frame(soup))
    return status
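
A minimal usage sketch for the extractor, assuming the module is importable as features_extraction and the target page is reachable (example.com is a placeholder):

import features_extraction as fe

# Each feature is encoded in {-1, 0, 1}; the order matches the
# status.append(...) calls in main() above.
features = fe.main("https://example.com/")
print(len(features), features)  # 17 values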

63 changes: 63 additions & 0 deletions run_alg.py
@@ -0,0 +1,63 @@
#!C:\Users\grred\AppData\Local\Programs\Python\Python36\python.exe
import sys
import time
from sklearn import datasets
from sklearn.feature_selection import VarianceThreshold
import re
import sklearn
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import features_extraction as fe
import site
import pandas as pd
import numpy as np
site.addsitedir('C:/Users/grred/AppData/Local/Programs/Python/Python36/lib/site-packages')
print("Content-Type: text/html\n\r\n")
from flask import Flask
from flask import jsonify
from flask import request

app = Flask(__name__)

def random_forests(dataset, class_labels, test_size):
    from sklearn.model_selection import train_test_split
    from sklearn.ensemble import RandomForestClassifier

    X = dataset
    Y = class_labels
    # Hold out a test split so the forest is fitted on training rows only.
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=42)

    model = RandomForestClassifier(n_estimators=5, criterion='entropy', random_state=42)
    model.fit(X_train, y_train)

    return model

@app.route('/main', methods=['GET', 'POST'])
def main():
    url = request.args.get('url')
    print(f"url is {url}")
    #url = "111111111111111111111111111111111111111111111111111111111111.com/"
    data = pd.read_csv("C:/Users/grred/OneDrive/Desktop/datasets/modified_dataset.csv")
    # Split features from the label column before dropping it, so the model
    # is not trained against one of its own feature columns.
    dataset = data.iloc[1:, :-1]
    class_labels = data.iloc[1:, -1]
    start_time = time.time()
    # NOTE: the model is retrained on every request; training once at
    # startup would make the endpoint much faster.
    model = random_forests(dataset, class_labels, 0.3)
    return_features = fe.main(url)
    feature_vector = np.array(return_features)

    end_time = time.time()
    print("runtime = " + str(end_time - start_time) + " seconds")
    result = {
        'url': url,
        'feature_string': ''.join([str(i) for i in return_features]),
        'run_time': str(end_time - start_time),
        'result_label': str(model.predict(feature_vector.reshape(1, -1))[0])
    }
    return jsonify(code=0, msg=result)


if __name__ == '__main__':
    app.run(port=5000, debug=True)
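
Once the server is running, the endpoint can be smoke-tested from another process; a sketch using only the standard library, assuming the default host/port from app.run above and a placeholder target URL:

import json
import urllib.parse
import urllib.request

# Query the /main route; the app returns {"code": 0, "msg": {...}}.
params = urllib.parse.urlencode({"url": "https://example.com/"})
with urllib.request.urlopen("http://127.0.0.1:5000/main?" + params) as resp:
    print(json.load(resp))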
