Skip to content

Commit d1dcacc

Browse files
arjitcodes authored and audiodude committed
refactor: Use project name for URL stripping in Simple Selection instead of netloc
1 parent 321eda7 commit d1dcacc

File tree

2 files changed

+12
-42
lines changed

2 files changed

+12
-42
lines changed

wp1/selection/models/simple.py

+8 -21
Original file line number | Diff line number | Diff line change
@@ -69,28 +69,15 @@ def validate(self, **params):
6969
invalid_article_names.append(decoded_item)
7070
is_valid = False
7171
if is_valid:
72-
article_name = self.extract_article_name(decoded_item)
73-
if not article_name:
74-
invalid_article_names.append(decoded_item)
75-
else:
76-
valid_article_names.append(article_name)
72+
project_domain = 'en.wikipedia.org'
73+
if 'project' in params:
74+
project_domain = params['project']
75+
wiki_prefix = f"https://{project_domain}/wiki/"
76+
index_prefix = f"https://{project_domain}/w/index.php?title="
77+
article_name = decoded_item.replace(wiki_prefix,
78+
"").replace(index_prefix, "")
79+
valid_article_names.append(article_name)
7780
if forbidden_chars:
7881
errors.append('The list contained the following invalid characters: ' +
7982
', '.join(forbidden_chars))
8083
return (valid_article_names, invalid_article_names, errors)
81-
82-
def extract_article_name(self, url):
83-
if not url:
84-
return None
85-
86-
parsed_url = urllib.parse.urlparse(url)
87-
domain = parsed_url.netloc
88-
if not domain:
89-
return url
90-
if not domain.endswith("wikipedia.org"):
91-
return None
92-
93-
wiki_prefix = f"https://{domain}/wiki/"
94-
index_prefix = f"https://{domain}/w/index.php?title="
95-
article_name = url.replace(wiki_prefix, "").replace(index_prefix, "")
96-
return article_name

wp1/selection/models/simple_test.py

+4 -21
Original file line number | Diff line number | Diff line change
@@ -196,31 +196,14 @@ def test_validate_max_size(self):
196196

197197
def test_non_en_url_stripping(self):
198198
simple_builder_test = SimpleBuilder()
199-
expected = (['Statue_of_Liberty', 'Indonésie', 'Marie_Curie'], [], [])
199+
expected = (['Portugal', 'Europa', 'Marie_Curie'], [], [])
200200
params = {
201201
'list': [
202-
'https://en.wikipedia.org/wiki/Statue_of_Liberty',
203-
'https://fr.wikipedia.org/wiki/Indon%C3%A9sie',
202+
'https://de.wikipedia.org/wiki/Portugal',
203+
'https://de.wikipedia.org/wiki/Europa',
204204
'https://de.wikipedia.org/w/index.php?title=Marie_Curie'
205205
],
206-
'project': 'project_name'
207-
}
208-
actual = simple_builder_test.validate(**params)
209-
self.assertEqual(expected, actual)
210-
211-
def test_non_wikipedia_domain_invalid(self):
212-
simple_builder_test = SimpleBuilder()
213-
expected = (['Marie_Curie'], [
214-
'https://en.wiki.org/wiki/Statue_of_Liberty',
215-
'https://fr.wiktionary.org/wiki/Indonésie',
216-
], [])
217-
params = {
218-
'list': [
219-
'https://en.wiki.org/wiki/Statue_of_Liberty',
220-
'https://fr.wiktionary.org/wiki/Indon%C3%A9sie',
221-
'https://de.wikipedia.org/w/index.php?title=Marie_Curie'
222-
],
223-
'project': 'project_name'
206+
'project': 'de.wikipedia.org'
224207
}
225208
actual = simple_builder_test.validate(**params)
226209
self.assertEqual(expected, actual)

0 commit comments

Comments
 (0)