-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #42 from jisung-in/feature/34-crawling-best-book
[Feature] 베스트 셀러 크롤링 기능 추가
- Loading branch information
Showing
16 changed files
with
235 additions
and
50 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,10 @@ | ||
package com.jisungin.infra.crawler; | ||
|
||
import java.util.Map; | ||
|
||
public interface Crawler { | ||
|
||
CrawlingBook crawlBook(String isbn); | ||
Map<Long, CrawlingBook> crawlBestSellerBook(); | ||
|
||
} |
37 changes: 32 additions & 5 deletions
37
src/main/java/com/jisungin/infra/crawler/CrawlingBook.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,29 +1,56 @@ | ||
package com.jisungin.infra.crawler; | ||
|
||
import java.time.LocalDateTime; | ||
import lombok.Builder; | ||
import lombok.Getter; | ||
import lombok.ToString; | ||
|
||
@Getter | ||
@ToString | ||
public class CrawlingBook { | ||
|
||
private String imageUrl; | ||
private String title; | ||
private String content; | ||
private String isbn; | ||
private String publisher; | ||
private String imageUrl; | ||
private String thumbnail; | ||
private String[] authors; | ||
private LocalDateTime dateTime; | ||
|
||
@Builder | ||
private CrawlingBook(String imageUrl, String content) { | ||
this.imageUrl = imageUrl; | ||
private CrawlingBook(String title, String content, String isbn, String publisher, String imageUrl, String thumbnail, | ||
String authors, LocalDateTime dateTime) { | ||
this.title = title; | ||
this.content = content; | ||
this.isbn = isbn; | ||
this.publisher = publisher; | ||
this.imageUrl = imageUrl; | ||
this.thumbnail = thumbnail; | ||
this.authors = parseAuthorsToArr(authors); | ||
this.dateTime = dateTime; | ||
} | ||
|
||
public static CrawlingBook of(String imageUrl, String content) { | ||
public static CrawlingBook of(String title, String content, String isbn, String publisher, String imageUrl, | ||
String thumbnail, String authors, LocalDateTime dateTime) { | ||
return CrawlingBook.builder() | ||
.imageUrl(imageUrl) | ||
.title(title) | ||
.content(content) | ||
.isbn(isbn) | ||
.publisher(publisher) | ||
.imageUrl(imageUrl) | ||
.thumbnail(thumbnail) | ||
.authors(authors) | ||
.dateTime(dateTime) | ||
.build(); | ||
} | ||
|
||
public boolean isBlankContent() { | ||
return this.content.isBlank(); | ||
} | ||
|
||
private String[] parseAuthorsToArr(String authors) { | ||
return authors.split(" 저| 공저| 글| 편저| 원저")[0].split(","); | ||
} | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,10 +1,12 @@ | ||
package com.jisungin.infra.crawler; | ||
|
||
import java.util.Map; | ||
import org.jsoup.nodes.Document; | ||
|
||
public interface Parser { | ||
|
||
String parseIsbn(Document doc); | ||
CrawlingBook parseBook(Document doc); | ||
Map<Long, String> parseBestSellerBookId(Document doc); | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
23 changes: 0 additions & 23 deletions
23
src/main/java/com/jisungin/infra/crawler/Yes24CrawlerConstant.java
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
69 changes: 59 additions & 10 deletions
69
src/main/java/com/jisungin/infra/crawler/Yes24Parser.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,29 +1,78 @@ | ||
package com.jisungin.infra.crawler; | ||
|
||
import static com.jisungin.infra.crawler.Yes24CrawlerConstant.BOOK_CONTENT_CSS; | ||
import static com.jisungin.infra.crawler.Yes24CrawlerConstant.BOOK_IMAGE_ATTR; | ||
import static com.jisungin.infra.crawler.Yes24CrawlerConstant.BOOK_IMAGE_CSS; | ||
import static com.jisungin.infra.crawler.Yes24CrawlerConstant.ISBN_ATTR; | ||
import static com.jisungin.infra.crawler.Yes24CrawlerConstant.ISBN_CSS; | ||
|
||
import com.jayway.jsonpath.JsonPath; | ||
import java.time.LocalDate; | ||
import java.time.LocalDateTime; | ||
import java.util.List; | ||
import java.util.Map; | ||
import java.util.stream.Collectors; | ||
import java.util.stream.IntStream; | ||
import lombok.Setter; | ||
import org.jsoup.Jsoup; | ||
import org.jsoup.nodes.Document; | ||
import org.jsoup.nodes.Element; | ||
import org.jsoup.safety.Safelist; | ||
import org.jsoup.select.Elements; | ||
import org.springframework.boot.context.properties.ConfigurationProperties; | ||
import org.springframework.stereotype.Component; | ||
|
||
@Component | ||
@Setter | ||
@ConfigurationProperties(prefix = "crawler.yes24.parser") | ||
public class Yes24Parser implements Parser { | ||
|
||
private String isbnCss; | ||
private String isbnAttr; | ||
private String bookContentCss; | ||
private String bookJsonCss; | ||
private String bestRankingCss; | ||
private String bestIdCss; | ||
private String bestIdAttrs; | ||
|
||
@Override | ||
public String parseIsbn(Document doc) { | ||
return doc.select(ISBN_CSS).attr(ISBN_ATTR); | ||
return doc.select(isbnCss).attr(isbnAttr); | ||
} | ||
|
||
@Override | ||
public CrawlingBook parseBook(Document doc) { | ||
String image = doc.select(BOOK_IMAGE_CSS).attr(BOOK_IMAGE_ATTR); | ||
String content = Jsoup.clean(doc.select(BOOK_CONTENT_CSS).text(), Safelist.none()); | ||
String json = doc.select(bookJsonCss).html(); | ||
|
||
String title = parseJsonToString(json, "$.name"); | ||
String isbn = parseJsonToString(json, "$.workExample[0].isbn"); | ||
String imageUrl = parseJsonToString(json, "$.image"); | ||
String publisher = parseJsonToString(json, "$.publisher.name"); | ||
String authors = parseJsonToString(json, "$.author.name"); | ||
String thumbnail = imageUrl.replace("XL", "M"); | ||
String content = Jsoup.clean(doc.select(bookContentCss).text(), Safelist.none()); | ||
LocalDateTime dateTime = parseDate(parseJsonToString(json, "$.workExample[0].datePublished")); | ||
|
||
return CrawlingBook.of(title, content, isbn, publisher, imageUrl, thumbnail, authors, dateTime); | ||
} | ||
|
||
@Override | ||
public Map<Long, String> parseBestSellerBookId(Document doc) { | ||
Elements rankings = doc.select(bestRankingCss); | ||
List<String> bookIds = doc.select(bestIdCss) | ||
.eachAttr(bestIdAttrs); | ||
|
||
return IntStream.range(0, rankings.size()) | ||
.boxed() | ||
.collect(Collectors.toMap( | ||
i -> parseRanking(rankings.get(i)), | ||
bookIds::get)); | ||
} | ||
|
||
private Long parseRanking(Element rankingElement) { | ||
return Long.parseLong(rankingElement.text()); | ||
} | ||
|
||
private String parseJsonToString(String json, String path) { | ||
return JsonPath.read(json, path); | ||
} | ||
|
||
return CrawlingBook.of(image, content); | ||
private LocalDateTime parseDate(String dateString) { | ||
return LocalDate.parse(dateString).atStartOfDay(); | ||
} | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -10,4 +10,5 @@ spring: | |
prod-env: | ||
- prod | ||
include: | ||
oauth | ||
- oauth | ||
- crawler |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.