Skip to content

Commit e2af6ee

Browse files
committed
add ethos dataset
1 parent cf68e52 commit e2af6ee

File tree

7 files changed

+70
-2
lines changed

7 files changed

+70
-2
lines changed

.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -131,3 +131,4 @@ dmypy.json
131131
application.yml
132132
data/result
133133
application.yml
134+
data/cache

config.py

+3
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,9 @@ def result_file(self, file_name):
3030
def input_file(self, file_name):
    """Return the path of ``file_name`` inside the ``'input'`` data directory."""
    input_dir = self.data_dir('input')
    return join(input_dir, file_name)
3232

33+
def cache_dir(self):
    """Return the ``'cache'`` data directory as resolved by ``data_dir``."""
    cache_location = self.data_dir('cache')
    return cache_location
35+
3336
def get_env(self, var):
    """Return the value that the private ``__get_var`` lookup yields for ``var``."""
    value = self.__get_var(var)
    return value
3538

data_model/abstract_data.py

+18-2
Original file line numberDiff line numberDiff line change
@@ -5,18 +5,34 @@
55
import pandas as pd
66

77
from config import config
8+
from util.downloader import Downloader
89

910

1011
class AbstractData(ABC):
1112
def __init__(self):
12-
input_file = config.input_file(self.FILE_NAME)
13-
self.data = pd.read_csv(input_file)
13+
file = self._load_file()
14+
self.data = self._preprocess_data(file)
15+
16+
def _load_file(self):
17+
if self.FILE_URL:
18+
input_file = Downloader().download(self.FILE_NAME, self.FILE_URL)
19+
else:
20+
input_file = config.input_file(self.FILE_NAME)
21+
return input_file
22+
23+
def _preprocess_data(self, file):
24+
return pd.read_csv(file)
1425

1526
@property
1627
@abstractmethod
1728
def FILE_NAME(self):
1829
pass
1930

31+
@property
32+
@abstractmethod
33+
def FILE_URL(self):
34+
pass
35+
2036
@abstractmethod
2137
def get_data(self):
2238
pass

data_model/ethos_data.py

+18
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
import pandas as pd
2+
3+
from config import config
4+
from data_model.abstract_data import AbstractData
5+
6+
7+
class EthosData(AbstractData):
    """Ethos binary dataset, read from a semicolon-separated CSV file."""

    FILE_URL = 'https://raw.githubusercontent.com/intelligence-csd-auth-gr/Ethos-Hate-Speech-Dataset/master/ethos/ethos_data/Ethos_Dataset_Binary.csv'
    FILE_NAME = 'Ethos_Dataset_Binary.csv'

    def get_label(self):
        """Return the ``isHate`` column of the loaded data."""
        hate_column = self.data['isHate']
        return hate_column

    def get_data(self):
        """Return the ``comment`` column of the loaded data."""
        comment_column = self.data['comment']
        return comment_column

    def _preprocess_data(self, file):
        """Parse ``file`` with ``pd.read_csv`` using ``;`` as the separator."""
        frame = pd.read_csv(file, sep=';')
        return frame

main.py

+11
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
from config import config
44
from data_analysis import word_cloud_generator
5+
from data_model.ethos_data import EthosData
56
from data_model.south_park_data import SouthParkData
67
from data_model.twitter_data import TwitterData
78

@@ -19,3 +20,13 @@
1920
all_text = ' '.join(data[labels == 1])
2021
print(word_cloud_generator.generate(all_text, 'twitter_word_cloud_label_1.png'))
2122

23+
# Build the dataset once: every EthosData() construction resolves and parses
# the CSV again, so data and labels are taken from the same instance.
ethos = EthosData()
data = ethos.get_data()
labels = ethos.get_label()

# NOTE(review): the masks below keep only rows whose label equals exactly
# 0 or 1 -- confirm that the `isHate` column holds no other values.
for label in (0, 1):
    all_text = ' '.join(data[labels == label])
    word_cloud_generator.generate(all_text, f'ethos_word_cloud_label_{label}.png')
31+
32+

util/cache_handler.py

Whitespace-only changes.

util/downloader.py

+19
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
import os
2+
3+
from config import config
4+
from util.logger import log
5+
6+
7+
class Downloader:
    """Fetches remote files into the directory given by ``config.cache_dir()``."""

    @staticmethod
    def download(file_name, url, zip=False):
        """Return the cached path for ``file_name``, downloading it if absent.

        Args:
            file_name: Name of the file expected inside the cache directory.
            url: Location to download from when the file is not cached yet.
            zip: When true, the download is unpacked into the cache directory
                with ``dload.save_unzip`` instead of being saved directly
                with ``dload.save``. The name shadows the builtin but is kept
                so existing keyword callers keep working.

        Returns:
            ``file_name`` joined onto the cache directory.
        """
        cache_dr = config.cache_dir()
        local_path = os.path.join(cache_dr, file_name)
        if os.path.exists(local_path):
            # Already cached: nothing to download.
            return local_path

        # The cache directory may not exist yet (e.g. on a fresh checkout),
        # so create it before anything is written into it.
        os.makedirs(cache_dr, exist_ok=True)

        # Imported lazily so the dependency is only needed for real downloads.
        import dload
        if zip:
            dload.save_unzip(url, cache_dr)
        else:
            dload.save(url, local_path)
        log.info(f"Downloading finished: {url}")
        return local_path

0 commit comments

Comments
 (0)