Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

build scrapper passed #73

Closed
wants to merge 14 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 9 additions & 6 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,16 @@
"start": "node ./bin/www"
},
"dependencies": {
"express": "4.17.1",
"static-favicon": "1.0.2",
"morgan": "1.9.1",
"cookie-parser": "1.4.4",
"body-parser": "1.19.0",
"ejs": "2.7.4",
"cheerio": "^1.0.0-rc.3",
"cookie-parser": "1.4.4",
"debug": "4.1.1",
"jade": "1.11.0"
"ejs": "2.7.4",
"express": "4.17.1",
"jade": "1.11.0",
"moment": "^2.24.0",
"morgan": "1.9.1",
"request": "^2.88.0",
"static-favicon": "1.0.2"
}
}
6 changes: 5 additions & 1 deletion public/css/style.min.css
Original file line number Diff line number Diff line change
Expand Up @@ -9190,7 +9190,11 @@ textarea {
display: block;
height: 400px;
width: 100%;
overflow: hidden;

/* Project cards rotate animation */
perspective: 250rem;
-moz-perspective: 250rem;
-webkit-perspective: 250rem;
}
.project-item:hover .project-item-card-front {
-webkit-transform: rotateY(180deg);
Expand Down
25 changes: 22 additions & 3 deletions routes/index.js
Original file line number Diff line number Diff line change
@@ -1,10 +1,29 @@
var express = require('express');
var router = express.Router();
const express = require('express');
const router = express.Router();
const moment = require("moment")
const json = require('./data.json')

// View model for blog.ejs: at most the first four entries of the loaded JSON
// array, with `release` reformatted as DD/MM/YYYY.
// slice() is used instead of a fixed 0..3 index loop so that a data file with
// fewer than four entries yields a shorter list rather than a TypeError while
// this module is being loaded.
// NOTE(review): the data.json entries visible in this change have no `cover`
// key, so `cover` will be undefined unless it is supplied elsewhere - confirm.
const ejsRenderedData = json.slice(0, 4).map(function (post) {
    return {
        title: post.title,
        href: post.href,
        author: post.author,
        release: moment(post.release).format("DD/MM/YYYY"),
        cover: post.cover
    }
})

/* GET home page. */
// Renders the `index` view for requests to "/".
// NOTE(review): this listing comes from a rendered diff, so the two
// res.render() calls below look like the removed and the added version of the
// same statement. Only the second one passes `ejsRenderedData` to the view;
// confirm that the real file contains just that one.
router.get('/', function(req, res) {
console.log("check")
res.render('index');
res.render('index', { ejsRenderedData: ejsRenderedData });
});


module.exports = router;
14 changes: 14 additions & 0 deletions routes/json/cover.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
[
{
"img": "https://miro.medium.com/max/2380/1*pvH4Qw7wmBFu2meLGU4HXg.jpeg"
},
{
"img": "https://miro.medium.com/max/1460/1*evXUXDtSpBZo_HHfXt4CNA.jpeg"
},
{
"img": "https://miro.medium.com/max/442/1*-5x0MC7k53mV4VMqKiKHOg.png"
},
{
"img": "https://miro.medium.com/max/2560/1*5yVOKthzCe2Ko5zoCgrLkQ.jpeg"
}
]
68 changes: 68 additions & 0 deletions routes/json/data.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
[
{
"title": "Frequently asked Interview Questions in react-redux",
"href": "https://blog.zairza.in/frequently-asked-interview-questions-in-react-redux-bc774733b449?source=collection_home---5------0-----------------------",
"author": "Jayashree Panda",
"release": "2019-11-22T08:11:26.615Z"
},
{
"title": "OAuth using MEVN Stack",
"href": "https://blog.zairza.in/oauth-using-mevn-stack-4b4a383dae08?source=collection_home---6------0-----------------------",
"author": "Ramakrishna Pattnaik",
"release": "2019-08-25T12:13:49.122Z"
},
{
"title": "A Guide to Dual Booting",
"href": "https://blog.zairza.in/a-guide-to-dual-booting-d9dffe042ee6?source=collection_home---6------1-----------------------",
"author": "Dikshant Brahma",
"release": "2019-08-10T06:01:01.253Z"
},
{
"title": "Home Automation : Making Our Lives Easier",
"href": "https://blog.zairza.in/home-automation-making-our-lives-easier-a782ee067ea8?source=collection_home---6------2-----------------------",
"author": "Subhangi Choudhary",
"release": "2019-08-06T10:19:07.530Z"
},
{
"title": "What is “BITCOIN” and How its Mining Process Works…",
"href": "https://blog.zairza.in/https-medium-com-pruthwirajnayak-what-is-bitcoin-and-how-its-mining-process-works-8cf5aa51d87f?source=collection_home---6------3-----------------------",
"author": "Pruthwiraj Nayak",
"release": "2019-07-29T10:55:38.837Z"
},
{
"title": "Modern Radios — Miles of Range and Years of Battery Life?",
"href": "https://blog.zairza.in/modern-radios-miles-of-range-and-years-of-battery-life-6ca1a690c4d9?source=collection_home---6------4-----------------------",
"author": "Sanjanamohapatra 99",
"release": "2019-07-14T09:29:38.928Z"
},
{
"title": "Why ROS?",
"href": "https://blog.zairza.in/why-ros-18221b2fe6a?source=collection_home---6------5-----------------------",
"author": "Abhishek Mishra",
"release": "2019-06-26T17:40:00.150Z"
},
{
"title": "5G: Is the future really near ?",
"href": "https://blog.zairza.in/5g-is-the-future-really-near-f194b95ef94e?source=collection_home---6------6-----------------------",
"author": "Aditya Bisoi",
"release": "2019-06-21T11:31:01.210Z"
},
{
"title": "Human Area Network (HAN) : Turning Humans Into Swipe Card?",
"href": "https://blog.zairza.in/human-area-network-han-turning-humans-into-swipe-card-7d670b9ee0cb?source=collection_home---6------7-----------------------",
"author": "Sanjanamohapatra 99",
"release": "2019-06-16T14:35:57.071Z"
},
{
"title": "Security Features of Blockchain and How it Works",
"href": "https://blog.zairza.in/security-features-of-blockchain-and-how-it-works-2870490d3461?source=collection_home---6------8-----------------------",
"author": "Pruthwiraj Nayak",
"release": "2019-06-07T19:19:52.142Z"
},
{
"title": "Industrial Automation For Enabling Industry 4.0",
"href": "https://blog.zairza.in/industrial-automation-for-enabling-industry-4-0-b6144898adb8?source=collection_home---6------9-----------------------",
"author": "Sarthak Kar",
"release": "2019-06-01T16:07:34.086Z"
}
]
88 changes: 88 additions & 0 deletions routes/scrapcover.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
// This file is responsible for scraping the cover image from each individual blog post.

// TODO: scraper for cover image from individual blog post

const express = require("express");
const fs = require("fs");
const cheerio = require("cheerio")
const request = require("request")
const json = require("./json/data.json")

const app = express();

// Module-level state shared with the /scrapcover route handler.
let count = 0
let finalFourUrl = []

// Links of the first four posts listed in data.json.
// slice() keeps this safe when data.json holds fewer than four entries; a
// fixed 0..3 index loop would throw while the module is being loaded.
let bloglinks = json.slice(0, 4).map(function (post) {
    return post.href
})


/**
 * Download one blog post and resolve with the `src` of its first cover image.
 *
 * The image markup is read from the <noscript> child of every <figure> that
 * encloses a `.paragraph-image` element; only the first match is returned.
 *
 * @param {string} postUrl - Address of the blog post to fetch.
 * @returns {Promise<string|undefined>} The first image source found, or
 *   undefined when the page has no matching figure. Rejects with the error
 *   reported by `request` when the download itself fails.
 */
function fetchCoverImage(postUrl) {
    return new Promise(function (resolve, reject) {
        request(postUrl, function (error, response, html) {
            if (error) {
                reject(error)
                return
            }

            const $ = cheerio.load(html)
            const sources = []

            $(".paragraph-image").closest("figure").each(function () {
                const markup = $(this).find("noscript").html()
                sources.push($(markup).attr("src"))
            })

            resolve(sources[0])
        })
    })
}

/**
 * GET /scrapcover
 *
 * Scrapes the cover image of the first four posts in data.json and stores the
 * result in routes/json/cover.json as a list of `{ "img": <src> }` objects.
 *
 * Differences from the previous implementation, all deliberate:
 *  - every call does the full job (no module-level counter that stays
 *    exhausted after the first request);
 *  - the list is written once, in the same order as the posts in data.json,
 *    instead of once per response in arrival order;
 *  - a failed download no longer reuses a stale or undefined cover: the file
 *    is left untouched and the client gets a 502;
 *  - the client always receives a response (the list on success, a JSON
 *    error otherwise) instead of waiting forever.
 */
app.get("/scrapcover", function (req, res) {

    const postUrls = json.slice(0, 4).map(function (post) {
        console.log(post.href)
        return post.href
    })

    Promise.all(postUrls.map(function (postUrl) {
        return fetchCoverImage(postUrl)
    }))
        .then(function (covers) {
            const coverList = covers.map(function (src) {
                return { "img": src }
            })

            fs.writeFile(__dirname + "/../routes/json/cover.json", JSON.stringify(coverList, null, 4), function (err) {
                if (err) {
                    console.log(err)
                    res.status(500).json({ error: "could not write cover.json" })
                    return
                }

                res.json(coverList)
            })
        })
        .catch(function (error) {
            console.log(error)
            res.status(502).json({ error: "could not fetch cover images" })
        })

})


// Boot the stand-alone cover scraper and log a line once it accepts connections.
app.listen("3000", () => {
    console.log("Magic happens on port 3000")
})


103 changes: 103 additions & 0 deletions routes/scraper.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
// The scraper for the blog.ejs section in the application.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Where do you run this file?

// The source for scraping is: https://blog.zairza.in/

// Things we need from the website:
// clickable cover image with blog link
// date published in DD/MM/YYYY format
// title of the blog post
// author of the blog post

const express = require("express");
const fs = require("fs");
const cheerio = require("cheerio");
const request = require("request");

const app = express();


/**
 * GET /scrape
 *
 * Downloads the blog home page, extracts title, link, author and release
 * timestamp of every listed post and stores them in routes/json/data.json.
 *
 * Entries from `.postArticle--short` elements are appended in page order;
 * entries from `.u-paddingTop30` elements (the latest post) are then put at
 * the front of the list.
 *
 * The client receives the scraped list as JSON on success, a 502 when the
 * blog could not be downloaded and a 500 when the file could not be written.
 * Previously no response was sent at all and a download error was ignored.
 */
app.get("/scrape", function (req, res) {

    // main page to scrape (declared locally; it used to leak as a global)
    const url = "https://blog.zairza.in/"

    // fetch the page to be scraped with the request library
    request(url, function (error, response, html) {

        if (error) {
            console.log(error)
            res.status(502).json({ error: "could not fetch " + url })
            return
        }

        const $ = cheerio.load(html)

        // scraped data is stored here
        const data = []

        // regular posts: one entry per matching element
        $(".postArticle--short").each(function () {
            const post = $(this)

            data.push({
                "title": post.find("h3").text(),
                "href": post.find(".u-clearfix").next("a").attr("href"),
                "author": post.find(".postMetaInline-authorLockup").children().first().text(),
                "release": post.find("time").attr("datetime")
            })
        })

        // This scrapes the one latest blog post and places it first
        $(".u-paddingTop30").each(function () {
            const latest = $(this)

            data.unshift({
                "title": latest.find("h3").first().text(),
                "href": latest.find(".u-borderLighter").attr("href"),
                "author": latest.find(".u-flexCenter").children().last().children().first().text(),
                "release": latest.find("time").attr("datetime")
            })
        })

        // Add data array into a json file, then answer the request
        fs.writeFile(__dirname + "/../routes/json/data.json", JSON.stringify(data, null, 4), function (err) {
            if (err) {
                console.log(err)
                res.status(500).json({ error: "could not write data.json" })
                return
            }

            res.json(data)
        });
    })

});

// Boot the stand-alone blog scraper and log a line once it accepts connections.
app.listen(3000, () => {
    console.log("Magic happens at port 3000")
});
Loading