Skip to content

Commit 9a66c30

Browse files
author
Abhay-1552
committed
Wikipedia Scraping
BeautifulSoup and Flask Framework
1 parent cbc6235 commit 9a66c30

File tree

6 files changed

+198
-0
lines changed

6 files changed

+198
-0
lines changed

Wikipdedia/flask_rendering.py

+27
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,27 @@
1+
from flask import Flask, render_template, request
2+
import practice_beautifulsoap as data
3+
4+
# The Flask application object. Templates are looked up in the "template"
# directory (singular) rather than Flask's default "templates" folder.
app = Flask(__name__, template_folder='template')
5+
6+
7+
@app.route('/', methods=["GET", "POST"])
def index():
    """Render the landing page with its language drop-down populated.

    The language names are obtained from ``data.lang()`` on every request and
    passed to ``index.html`` under the name ``languages``.
    """
    return render_template('index.html', languages=data.lang())
11+
12+
13+
@app.route("/display", methods=["POST"])
def output():
    """Render the scraped article text and an image for the submitted topic.

    Reads the ``topic`` and ``language`` form fields, fetches the article text
    with ``data.data()`` and an image URL with ``data.get_image_urls()``, and
    renders ``output.html`` with the results.

    Returns:
        The rendered ``output.html`` page, or a plain-text message with HTTP
        status 400 when either form field is missing or blank.
    """
    # request.form.get() returns None for an absent field; normalise to "" so
    # the .strip()/.upper() calls below cannot raise AttributeError.
    entered_topic = (request.form.get("topic") or "").strip()
    selected_language = request.form.get("language") or ""

    # The route only accepts POST, so no method check is needed; what does
    # need checking is that the client actually sent both fields.
    if not entered_topic or not selected_language:
        return "Please provide both a topic and a language.", 400

    soup_data = data.data(entered_topic, selected_language)
    soup_image = data.get_image_urls(entered_topic)

    return render_template('output.html', heading=entered_topic.upper(), data=soup_data,
                           url=soup_image, language=selected_language)
24+
25+
26+
if __name__ == "__main__":
    import os

    # Flask's debug mode enables the Werkzeug interactive debugger, which lets
    # anyone who can reach the server execute arbitrary Python. Keep it off
    # unless the developer opts in explicitly, e.g. `FLASK_DEBUG=1 python ...`.
    debug_enabled = os.environ.get("FLASK_DEBUG", "").strip().lower() in ("1", "true", "yes", "on")
    app.run(debug=debug_enabled)

Wikipdedia/main.py

+16
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,16 @@
1+
# This is a sample Python script.
2+
3+
# Press Shift+F10 to execute it or replace it with your code.
4+
# Press Double Shift to search everywhere for classes, files, tool windows, actions, and settings.
5+
6+
7+
def print_hi(name):
    """Write the greeting ``Hi, <name>`` to standard output."""
    greeting = f'Hi, {name}'
    print(greeting)
10+
11+
12+
# Press the green button in the gutter to run the script.
13+
if __name__ == '__main__':
    # Demo call; runs only when this file is executed as a script.
    print_hi('PyCharm')
15+
16+
# See PyCharm help at https://www.jetbrains.com/help/pycharm/

Wikipdedia/practice_beautifulsoap.py

+69
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,69 @@
1+
from bs4 import BeautifulSoup
2+
import requests
3+
4+
# Module-level cache filled by lang(): maps each <option>'s visible text on
# the Wikipedia portal page to its `lang` attribute. data() uses the value as
# the subdomain of the article URL.
language_symbols = {}
5+
6+
7+
def lang():
    """Fetch the language names offered on the Wikipedia portal page.

    Downloads https://www.wikipedia.org/, reads every ``<option>`` element and
    stores ``visible text -> lang attribute`` in the module-level
    ``language_symbols`` dict, which ``data()`` uses to build article URLs.

    Returns:
        list: The language names currently held in ``language_symbols``, or
        an empty list if the HTTP request failed.
    """
    try:
        # Without a timeout, requests waits indefinitely on a stalled
        # connection and would hang the page that calls this function.
        response = requests.get("https://www.wikipedia.org/", timeout=10)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print("Error fetching language data:", e)
        return []

    soup = BeautifulSoup(response.content, 'html.parser')

    for option in soup.find_all('option'):
        language = option.get_text(strip=True)
        # .get() rather than ['lang']: an <option> without that attribute
        # would raise KeyError, which the handler above does not catch.
        symbol = option.get('lang')
        if language and symbol:
            language_symbols[language] = symbol

    return list(language_symbols)
23+
24+
25+
def data(selected_topic, selected_language):
    """Return the headings and paragraphs of a Wikipedia article as plain text.

    Args:
        selected_topic: Article title as entered by the user.
        selected_language: Language name, one of the keys that ``lang()``
            stores in ``language_symbols``.

    Returns:
        str: Heading texts (upper-cased, surrounded by newlines) and paragraph
        texts (each followed by a newline) found under the element with id
        ``mw-content-text``; an empty string if that element is absent; or
        ``"Error fetching data."`` if the language is unknown or the HTTP
        request fails.
    """
    from urllib.parse import quote

    # language_symbols is only filled by lang(). Populate it on demand so a
    # call that arrives before lang() has run in this process still works.
    if not language_symbols:
        lang()

    symbol = language_symbols.get(selected_language)
    if symbol is None:
        # Without this guard the URL below becomes "https://None.wikipedia.org/...".
        print("Error fetching Wikipedia content: unknown language", repr(selected_language))
        return "Error fetching data."

    # Percent-encode the title so characters such as '?' and '#' stay part of
    # the path instead of being parsed as a query string or fragment.
    title = quote(str(selected_topic))
    url = f"https://{symbol}.wikipedia.org/wiki/{title}"

    try:
        data_response = requests.get(url, timeout=10)
        data_response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print("Error fetching Wikipedia content:", e)
        return "Error fetching data."

    data_soup = BeautifulSoup(data_response.content, 'html.parser')
    main_content = data_soup.find('div', {'id': 'mw-content-text'})
    if main_content is None:
        return ""

    heading_tags = {'h1', 'h2', 'h3', 'h4', 'h5', 'h6'}
    pieces = []
    # Text nodes among the descendants have name None and match neither branch.
    for element in main_content.descendants:
        if element.name in heading_tags:
            pieces.append("\n" + element.get_text(strip=True).upper() + "\n")
        elif element.name == 'p':
            pieces.append(element.get_text(strip=True) + "\n")

    # Join once at the end instead of growing a string with += in the loop.
    return "".join(pieces)
50+
51+
52+
def get_image_urls(query):
    """Return the first absolute image URL on a Google Images results page.

    Args:
        query: Search text.

    Returns:
        The first ``<img>`` ``src`` value that begins with ``"http"``, or
        ``None`` if the request fails or the page contains no such image.
    """
    try:
        # Passing the query via params= lets requests percent-encode it, so
        # spaces, '&' and '#' in the search text cannot corrupt the URL.
        image_response = requests.get(
            "https://www.google.com/search",
            params={"q": query, "tbm": "isch"},
            timeout=10,
        )
        image_response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print("Error fetching image URLs:", e)
        return None

    image_soup = BeautifulSoup(image_response.content, 'html.parser')

    # Only the first match is needed, so return as soon as it is found.
    for img in image_soup.find_all('img'):
        image_url = img.get('src')
        if image_url and image_url.startswith("http"):
            return image_url

    # No usable image. Indexing an empty list here would raise IndexError,
    # which the RequestException handler does not catch.
    return None

Wikipdedia/static/js/output.js

+9
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,9 @@
1+
/**
 * Submit guard for the search form: blocks submission while the language
 * drop-down still holds its "Select" placeholder value.
 *
 * @returns {(boolean|undefined)} false (after alerting the user) when the
 *     placeholder is selected; undefined otherwise, which lets the submit
 *     proceed.
 */
function validateForm() {
    var chosen = document.getElementById("language").value;

    if (chosen !== "Select") {
        return;
    }

    alert("Please select a language.");
    return false;
}
9+

Wikipdedia/template/index.html

+42
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,42 @@
1+
<!DOCTYPE html>
<html lang="en">

<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Input Web Page</title>
    <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/4.5.2/css/bootstrap.min.css">
    <!-- url_for keeps the script path correct regardless of the URL this page is served from. -->
    <script src="{{ url_for('static', filename='js/output.js') }}"></script>
</head>

<body>
    <div class="container text-center mt-4">
        <img src="https://www.wikipedia.org/portal/wikipedia.org/assets/img/Wikipedia-logo-v2@2x.png" alt="Wikipedia logo" height="150px" width="150px">
        <h1>Wikipedia</h1>
    </div>

    <!-- validateForm() rejects the "Select" placeholder; its non-empty value means `required` alone does not. -->
    <form action="/display" method="post" class="container mt-4" onsubmit="return validateForm()">
        <div class="form-group">
            <label for="topic">Topic Name:</label>
            <input type="text" class="form-control" name='topic' id="topic" placeholder="Enter Topic" required>
        </div>
        <div class="form-group">
            <label for="language">Select Language:</label>
            <select name="language" id="language" class="form-control" required>
                <option value="Select">Select</option>
                {% for language in languages %}
                <option value="{{ language }}">{{ language }}</option>
                {% endfor %}
            </select>
        </div>
        <input type="submit" value="Submit" class="btn btn-primary">
        <input type="reset" value="Clear" class="btn btn-danger">
    </form>

    <script src="https://code.jquery.com/jquery-3.5.1.slim.min.js"></script>
    <!-- Bootstrap 4.5.x depends on Popper v1 (popper.js), not the v2 @popperjs/core package. -->
    <script src="https://cdn.jsdelivr.net/npm/popper.js@1.16.1/dist/umd/popper.min.js"></script>
    <script src="https://maxcdn.bootstrapcdn.com/bootstrap/4.5.2/js/bootstrap.min.js"></script>

</body>

</html>

Wikipdedia/template/output.html

+35
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,35 @@
1+
<!DOCTYPE html>
<html lang="en">

<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Output Web Page</title>
    <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/4.5.2/css/bootstrap.min.css">
    <style>
        /* Keep the scraped text's line breaks but wrap long lines and use a proportional font. */
        .container pre {
            white-space: pre-wrap;
            font-family: Arial, sans-serif;
            font-size: 17px;
        }
    </style>
</head>

<body>
    <div class="container text-center mt-4">
        {# url is None when no image could be fetched; skip the tag rather than render src="None". #}
        {% if url %}
        <img src="{{ url }}" alt="{{ heading }}" height="150px" width="150px">
        {% endif %}
        <h1> {{ heading }} </h1>
        <h6> in {{ language }} language</h6>
    </div>
    <div class="container mt-4">
        <div class="container border p-3">
            <pre>{{ data }}</pre>
        </div>
    </div>

    <script src="https://code.jquery.com/jquery-3.5.1.slim.min.js"></script>
    <!-- Bootstrap 4.5.x depends on Popper v1 (popper.js), not the v2 @popperjs/core package. -->
    <script src="https://cdn.jsdelivr.net/npm/popper.js@1.16.1/dist/umd/popper.min.js"></script>
    <script src="https://maxcdn.bootstrapcdn.com/bootstrap/4.5.2/js/bootstrap.min.js"></script>
</body>

</html>

0 commit comments

Comments
 (0)