Adding Machine Learning Model #174

Open · wants to merge 1 commit into master
14 changes: 14 additions & 0 deletions exam.py
@@ -0,0 +1,14 @@
from collections import Counter

def fun(num, ids, rem):
    # Count how often each id occurs.
    counts = Counter(ids)
    # Rebuild the id list ordered by ascending frequency, so slicing off
    # the first `rem` items removes the rarest ids first.
    temp = []
    for key, count in sorted(counts.items(), key=lambda item: item[1]):
        temp += [key] * count
    # Number of distinct ids that survive the removal.
    return len(set(temp[rem:]))

print(fun(6, [1, 1, 1, 2, 3, 2], 2))
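
For reviewers, `fun` answers: how many distinct ids remain after removing the `rem` rarest occurrences? A quick sanity check (the second call is a hypothetical extra case, not from the diff):

# temp ordered by ascending frequency: [3, 2, 2, 1, 1, 1]
assert fun(6, [1, 1, 1, 2, 3, 2], 2) == 2  # drops 3 and one 2 -> {2, 1}
assert fun(6, [1, 1, 1, 2, 3, 2], 4) == 1  # drops 3 and both 2s -> {1}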
228 changes: 228 additions & 0 deletions features_extraction.py
@@ -0,0 +1,228 @@
from bs4 import BeautifulSoup
import urllib.request
import bs4
import re
import socket
import whois
from datetime import datetime
import time
from googlesearch import search
import sys
from patterns import *


def having_ip_address(url):
    # Dotted-decimal, hexadecimal, and plain \d{1,3} IPv4 forms.
    patt = r"(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[0-9]?[0-9])(\.|$){4}"
    patt2 = r"(0x([0-9][0-9]|[A-F][A-F]|[A-F][0-9]|[0-9][A-F]))(\.|$){4}"
    ip = r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}"

    match = re.search(ip, url) or re.search(patt, url) or re.search(patt2, url)
    return 1 if match else -1

def url_length(url):
    # Short URLs read as legitimate, mid-length as suspicious, long as phishing.
    if len(url) < 54:
        return 1
    if 54 <= len(url) <= 75:
        return 0
    return -1

def tiny_url(url):
    # Drop a leading "www." before measuring; very short URLs suggest a
    # link-shortening service.
    url = url.replace("www.", '')
    if len(url) < 7:
        return 1
    return -1

def having_at_symbol(url):
    match = re.search('@', url)
    return 1 if match else -1

def double_slash_redirecting(url):
    # A "//" appearing after the scheme (position > 7) suggests a redirect.
    last_double_slash = url.rfind('//')
    return 1 if (last_double_slash > 7 or last_double_slash == -1) else -1

def prefix_suffix(domain):
    match = re.search('-', domain)
    return 1 if match else -1

def having_sub_domain(url):
    # Strip a leading IP literal (IPv4 dotted-decimal or IPv6) so its
    # separators are not counted as subdomain dots.
    if having_ip_address(url) == -1:
        match = re.search(
            r'(([01]?\d\d?|2[0-4]\d|25[0-5])\.([01]?\d\d?|2[0-4]\d|25[0-5])\.([01]?\d\d?|2[0-4]\d|25[0-5])\.'
            r'([01]?\d\d?|2[0-4]\d|25[0-5]))|(?:[a-fA-F0-9]{1,4}:){7}[a-fA-F0-9]{1,4}',
            url)
        if match:
            url = url[match.end():]
    # The dot count decides the verdict, and every URL now gets a return value.
    num_dots = [x.start() for x in re.finditer(r'\.', url)]
    if len(num_dots) <= 3:
        return 1
    elif len(num_dots) == 4:
        return 0
    return -1

def favicon(wiki, soup, domain):
    # Check the first <link href=...> inside <head>: a favicon referenced
    # from the page's own domain counts as legitimate.
    for head in soup.find_all('head'):
        for link in head.find_all('link', href=True):
            dots = [x.start() for x in re.finditer(r'\.', link['href'])]
            return 1 if wiki in link['href'] or len(dots) == 1 or domain in link['href'] else -1
    return 1

def https_token(url):
    http_https = "^(http|https)://.*"
    match = re.search(http_https, url)
    if match and match.start() == 0:
        url = url[match.end():]
    match = re.search('http|https', url)
    return -1 if match else 1

def request_url(wiki, soup, domain):
    i = 0
    success = 0
    for img in soup.find_all('img', src=True):
        dots = [x.start() for x in re.finditer(r'\.', img['src'])]
        if wiki in img['src'] or domain in img['src'] or len(dots) == 1:
            success = success + 1
        i = i + 1
    for audio in soup.find_all('audio', src=True):
        dots = [x.start() for x in re.finditer(r'\.', audio['src'])]
        if wiki in audio['src'] or domain in audio['src'] or len(dots) == 1:
            success = success + 1
        i = i + 1
    for embed in soup.find_all('embed', src=True):
        dots = [x.start() for x in re.finditer(r'\.', embed['src'])]
        if wiki in embed['src'] or domain in embed['src'] or len(dots) == 1:
            success = success + 1
        i = i + 1
    # The HTML tag name is "iframe".
    for iframe in soup.find_all('iframe', src=True):
        dots = [x.start() for x in re.finditer(r'\.', iframe['src'])]
        if wiki in iframe['src'] or domain in iframe['src'] or len(dots) == 1:
            success = success + 1
        i = i + 1
    try:
        percentage = success / float(i) * 100
    except ZeroDivisionError:
        # No external resources on the page.
        return 1

    if percentage < 22.0:
        return 1
    elif 22.0 <= percentage < 61.0:
        return 0
    else:
        return -1

def url_of_anchor(wiki, soup, domain):
    i = 0
    unsafe = 0
    for a in soup.find_all('a', href=True):
        if "#" in a['href'] or "javascript" in a['href'].lower() or "mailto" in a['href'].lower() or not (
                wiki in a['href'] or domain in a['href']):
            unsafe = unsafe + 1
        i = i + 1
    try:
        percentage = unsafe / float(i) * 100
    except ZeroDivisionError:
        # Page has no anchors at all.
        return 1
    if percentage < 31.0:
        return 1
    elif 31.0 <= percentage < 67.0:
        return 0
    else:
        return -1

def links_in_tags(wiki, soup, domain):
    i = 0
    success = 0
    for link in soup.find_all('link', href=True):
        dots = [x.start() for x in re.finditer(r'\.', link['href'])]
        if wiki in link['href'] or domain in link['href'] or len(dots) == 1:
            success = success + 1
        i = i + 1

    for script in soup.find_all('script', src=True):
        dots = [x.start() for x in re.finditer(r'\.', script['src'])]
        if wiki in script['src'] or domain in script['src'] or len(dots) == 1:
            success = success + 1
        i = i + 1
    try:
        percentage = success / float(i) * 100
    except ZeroDivisionError:
        # No <link> or <script> resources on the page.
        return 1

    if percentage < 17.0:
        return 1
    elif 17.0 <= percentage < 65.0:
        return 0
    else:
        return -1

def sfh(wiki, soup, domain):
    for form in soup.find_all('form', action=True):
        if form['action'] == "" or form['action'] == "about:blank":
            return -1
        elif wiki not in form['action'] and domain not in form['action']:
            return 0
        else:
            return 1
    return 1

def submitting_to_email(soup):
    for form in soup.find_all('form', action=True):
        return 1 if "mailto:" in form['action'] else -1
    return -1

def i_frame(soup):
    # BeautifulSoup's HTML parser lowercases attribute names, so the
    # frameBorder attribute is matched as "frameborder".
    for iframe in soup.find_all('iframe', width=True, height=True, frameborder=True):
        if iframe['width'] == "0" and iframe['height'] == "0" and iframe['frameborder'] == "0":
            return 0
        if iframe['width'] == "0" or iframe['height'] == "0" or iframe['frameborder'] == "0":
            return -1
    return 1

def port(url):
    pattern = r"https?:\/\/(?:w{1,3}\.)?[^\s.]+(?:\.[a-z]+)*(?::\d+)?(?![^<]*(?:<\/\w+>|\/?>))"
    match = re.search(pattern, url)
    return 1 if match else -1

def https_in_url_domain(url):
    pattern = r"^(?:http:\/\/|www\.|https:\/\/)([^\/]+)"
    match = re.search(pattern, url)
    return 1 if match else -1

def get_hostname_from_url(url):
    # Strip the scheme and an optional leading "www.", then cut at the
    # first "/" so only the hostname remains.
    hostname = re.sub(r"^(?:https?://)?(?:www\.)?", "", url)
    slash = hostname.find("/")
    if slash != -1:
        hostname = hostname[:slash]
    return hostname

def main(url):
    # The DOM-based features need the page markup, not the URL string,
    # so fetch the document before parsing it.
    try:
        page = urllib.request.urlopen(url, timeout=10).read()
    except Exception:
        page = ""
    soup = BeautifulSoup(page, 'html.parser')
    status = []
    hostname = get_hostname_from_url(url)
    status.append(having_ip_address(url))
    status.append(url_length(url))
    status.append(tiny_url(url))
    status.append(having_at_symbol(url))
    status.append(double_slash_redirecting(url))
    status.append(prefix_suffix(hostname))
    status.append(having_sub_domain(url))
    status.append(https_token(url))
    status.append(favicon(url, soup, hostname))
    status.append(port(url))
    status.append(https_in_url_domain(url))
    status.append(request_url(url, soup, hostname))
    status.append(url_of_anchor(url, soup, hostname))
    status.append(links_in_tags(url, soup, hostname))
    status.append(sfh(url, soup, hostname))
    status.append(submitting_to_email(soup))
    status.append(i_frame(soup))
    return status
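
A minimal usage sketch for the extractor, assuming the module is importable as features_extraction and the target page is reachable (example.com is a placeholder):

import features_extraction as fe

# Each feature is encoded in {-1, 0, 1}; the order matches the
# status.append(...) calls in main() above.
features = fe.main("https://example.com/")
print(len(features), features)  # 17 values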

63 changes: 63 additions & 0 deletions run_alg.py
@@ -0,0 +1,63 @@
#!C:\Users\grred\AppData\Local\Programs\Python\Python36\python.exe
import sys
import time
from sklearn import datasets
from sklearn.feature_selection import VarianceThreshold
import re
import sklearn
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import features_extraction as fe
import site
import pandas as pd
import numpy as np
site.addsitedir('C:/Users/grred/AppData/Local/Programs/Python/Python36/lib/site-packages')
print("Content-Type: text/html\n\r\n")
from flask import Flask
from flask import jsonify
from flask import request

app = Flask(__name__)

def random_forests(dataset, class_labels, test_size):
    from sklearn.model_selection import train_test_split
    from sklearn.ensemble import RandomForestClassifier

    X = dataset
    Y = class_labels
    # Hold out a test split so the forest is fitted on training rows only.
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=42)

    model = RandomForestClassifier(n_estimators=5, criterion='entropy', random_state=42)
    model.fit(X_train, y_train)

    return model

@app.route('/main', methods=['GET', 'POST'])
def main():
    url = request.args.get('url')
    print(f"url is {url}")
    #url = "111111111111111111111111111111111111111111111111111111111111.com/"
    data = pd.read_csv("C:/Users/grred/OneDrive/Desktop/datasets/modified_dataset.csv")
    # Split features from the label column before dropping it, so the model
    # is not trained against one of its own feature columns.
    dataset = data.iloc[1:, :-1]
    class_labels = data.iloc[1:, -1]
    start_time = time.time()
    # NOTE: the model is retrained on every request; training once at
    # startup would make the endpoint much faster.
    model = random_forests(dataset, class_labels, 0.3)
    return_features = fe.main(url)
    feature_vector = np.array(return_features)

    end_time = time.time()
    print("runtime = " + str(end_time - start_time) + " seconds")
    result = {
        'url': url,
        'feature_string': ''.join([str(i) for i in return_features]),
        'run_time': str(end_time - start_time),
        'result_label': str(model.predict(feature_vector.reshape(1, -1))[0])
    }
    return jsonify(code=0, msg=result)


if __name__ == '__main__':
    app.run(port=5000, debug=True)
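
Once the server is running, the endpoint can be smoke-tested from another process; a sketch using only the standard library, assuming the default host/port from app.run above and a placeholder target URL:

import json
import urllib.parse
import urllib.request

# Query the /main route; the app returns {"code": 0, "msg": {...}}.
params = urllib.parse.urlencode({"url": "https://example.com/"})
with urllib.request.urlopen("http://127.0.0.1:5000/main?" + params) as resp:
    print(json.load(resp))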
