From 341632290eab9257995bf3ccf2c6da1ff26fb049 Mon Sep 17 00:00:00 2001 From: TakenPt Date: Sun, 25 Feb 2024 03:57:54 +0900 Subject: [PATCH 1/5] add downloading --- Epub/KoeBook.Epub/ScrapingNarou.cs | 198 +++++++++++++++++++++++++++++ 1 file changed, 198 insertions(+) create mode 100644 Epub/KoeBook.Epub/ScrapingNarou.cs diff --git a/Epub/KoeBook.Epub/ScrapingNarou.cs b/Epub/KoeBook.Epub/ScrapingNarou.cs new file mode 100644 index 0000000..efc5718 --- /dev/null +++ b/Epub/KoeBook.Epub/ScrapingNarou.cs @@ -0,0 +1,198 @@ +using AngleSharp; +using AngleSharp.Dom; +using AngleSharp.Html.Dom; +using AngleSharp.Io; +using KoeBook.Epub.Service; +using System.IO; +using System.Net.Http.Json; +using static KoeBook.Epub.ScrapingHelper; + +namespace KoeBook.Epub +{ + public partial class ScrapingNarouService : IScrapingService + { + public ScrapingNarouService(IHttpClientFactory httpClientFactory) + { + _httpCliantFactory = httpClientFactory; + } + + private readonly IHttpClientFactory _httpCliantFactory; + + public async Task ScrapingAsync(string url, string coverFilePath, string imageDirectory, Guid id, CancellationToken ct) + { + var config = Configuration.Default.WithDefaultLoader(); + using var context = BrowsingContext.New(config); + var doc = await context.OpenAsync(url, ct).ConfigureAwait(false); + + // title の取得 + var bookTitle = doc.QuerySelector(".novel_title"); + if (bookTitle is null) + { + throw new EpubDocumentException($"Failed to get title properly.\nUrl may be not collect"); + } + + // auther の取得 + var bookAuther = doc.QuerySelector(".novel_writername a"); + if (bookAuther is null) + { + throw new EpubDocumentException($"Failed to get auther properly.\nUrl may be not collect"); + } + + bool isRensai = true; + int allNum = 0; + + var apiUrl = $"https://api.syosetu.com/novelapi/api/?of=ga-nt&ncode={UrlToNcode().Replace(url, "$1")}&out=json"; + + // APIを利用して、noveltype : 連載(1)か短編(2)か、general_all_no : 全掲載部分数 + var message = new HttpRequestMessage(System.Net.Http.HttpMethod.Get, apiUrl); + message.Headers.UserAgent.ParseAdd("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36"); + var client = _httpCliantFactory.CreateClient(); + var result = await client.SendAsync(message, ct).ConfigureAwait(false); + var test = await result.Content.ReadAsStringAsync(ct).ConfigureAwait(false); + if (result.IsSuccessStatusCode) + { + var content = await result.Content.ReadFromJsonAsync(ct).ConfigureAwait(false); + if (content != null) + { + if (content[1].noveltype != null) + { + if (content[1].noveltype == 2) + { + isRensai = false; + } + } + else + { + throw new EpubDocumentException("faild to get data by Narou API"); + } + if (content[1].general_all_no != null) + { + allNum = (int)content[1].general_all_no!; + } + if (allNum == 0) + { + throw new EpubDocumentException("faild to get data by Narou API"); + } + } + } + else + { + throw new EpubDocumentException("Url may be not Correct"); + } + + if (isRensai) + { + //ReadPageAsync(url, imageDirectory, ct); + var temp = ReadPageAsync("https://ncode.syosetu.com/n1443bp/2/", imageDirectory, ct); + Console.WriteLine(temp.Result.Elements.Count); + } + else + { + ReadPageAsync(url, imageDirectory, ct); + } + + return new EpubDocument("", "", "", id); + + } + + public record BookInfo(int? allcount, int? noveltype, int? general_all_no); + + private async Task
ReadPageAsync(string url, string imageDirectory, CancellationToken ct) + { + var config = Configuration.Default.WithDefaultLoader(); + using var context = BrowsingContext.New(config); + var doc = await context.OpenAsync(url, ct).ConfigureAwait(false); + + var chapterTitle = doc.QuerySelector(".chapter_title"); + + var sectionTitle = doc.QuerySelector(".novel_subtitle"); + if (sectionTitle == null) + { + throw new EpubDocumentException("Can not find title of page"); + } + + var section = new Section(sectionTitle.InnerHtml); + + + var main_text = doc.QuerySelector("#novel_honbun"); + if (main_text != null) + { + foreach (var item in main_text.Children) + { + if (item is IHtmlParagraphElement) + { + if (item.ChildElementCount == 0) + { + if (!string.IsNullOrWhiteSpace(item.InnerHtml)) + { + section.Elements.Add(new Paragraph() { Text = item.InnerHtml }); + } + } + else if (item.ChildElementCount == 1) + { + if (item.Children[0] is IHtmlAnchorElement aElement) + { + if (aElement.ChildElementCount == 1) + { + if (aElement.Children[0] is IHtmlImageElement img) + { + if (img.Source != null) + { + // 画像のダウンロード + var loader = context.GetService(); + if (loader != null) + { + var downloading = loader.FetchAsync(new DocumentRequest(new Url(img.Source))); + ct.Register(() => downloading.Cancel()); + var response = await downloading.Task.ConfigureAwait(false); + using var ms = new MemoryStream(); + await response.Content.CopyToAsync(ms, ct).ConfigureAwait(false); + var filePass = imageDirectory + FileUrlToFileName().Replace(img.Source, "$1"); + + File.WriteAllBytes(filePass, ms.ToArray()); + + } + } + else + { + throw new EpubDocumentException("Unexpected structure"); + } + } + } + else + { + throw new EpubDocumentException("Unexpected structure"); + } + } + else + { + + } + } + else + { + throw new EpubDocumentException("Unexpected structure"); + } + } + else + { + throw new EpubDocumentException("Unexpected structure"); + } + } + } + else + { + throw new EpubDocumentException("There is no honbun."); + } + return section; + } + + + + [System.Text.RegularExpressions.GeneratedRegex(@"https://.{5,7}.syosetu.com/(.{7}).?")] + private static partial System.Text.RegularExpressions.Regex UrlToNcode(); + + [System.Text.RegularExpressions.GeneratedRegex(@"http.{1,}/([^/]{0,}\.[^/]{1,})")] + private static partial System.Text.RegularExpressions.Regex FileUrlToFileName(); + } +} From c307468302beb0c240cbdf407031bd0fcbbb28f0 Mon Sep 17 00:00:00 2001 From: TakenPt Date: Sun, 25 Feb 2024 11:34:30 +0900 Subject: [PATCH 2/5] =?UTF-8?q?=E3=81=AA=E3=82=8D=E3=81=86=E3=81=AE?= =?UTF-8?q?=E3=82=B9=E3=82=AF=E3=83=AC=E3=82=A4=E3=83=94=E3=83=B3=E3=82=B0?= =?UTF-8?q?=E5=8B=95=E3=81=8F=E3=81=A8=E3=81=93=E3=81=BE=E3=81=A7?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Epub/KoeBook.Epub/ScrapingNarou.cs | 131 ++++++++++++++++++++++++----- 1 file changed, 111 insertions(+), 20 deletions(-) diff --git a/Epub/KoeBook.Epub/ScrapingNarou.cs b/Epub/KoeBook.Epub/ScrapingNarou.cs index efc5718..58a4277 100644 --- a/Epub/KoeBook.Epub/ScrapingNarou.cs +++ b/Epub/KoeBook.Epub/ScrapingNarou.cs @@ -80,38 +80,105 @@ public async Task ScrapingAsync(string url, string coverFilePath, throw new EpubDocumentException("Url may be not Correct"); } + var document = new EpubDocument(bookTitle.InnerHtml, bookAuther.InnerHtml, coverFilePath, id); if (isRensai) { - //ReadPageAsync(url, imageDirectory, ct); - var temp = ReadPageAsync("https://ncode.syosetu.com/n1443bp/2/", imageDirectory, ct); - Console.WriteLine(temp.Result.Elements.Count); + List SectionWithChapterTitleList = new List(); + for (int i = 1; i <= allNum; i++) + { + Console.WriteLine(i); + await Task.Delay(500); + var pageUrl = System.IO.Path.Combine(url,i.ToString()); + var load = await ReadPageAsync(pageUrl,isRensai,imageDirectory, ct).ConfigureAwait(false); + SectionWithChapterTitleList.Add(load); + } + string? chapterTitle = null; + foreach (var sectionWithChapterTitle in SectionWithChapterTitleList) + { + if (sectionWithChapterTitle != null) + { + if (sectionWithChapterTitle.title != null) + { + if (sectionWithChapterTitle.title != chapterTitle) + { + chapterTitle = sectionWithChapterTitle.title; + document.Chapters.Add(new Chapter() { Title = chapterTitle }); + document.Chapters[^1].Sections.Add(sectionWithChapterTitle.section); + } + else + { + document.Chapters[^1].Sections.Add(sectionWithChapterTitle.section); + } + } + else + { + if (document.Chapters.Count == 0) + { + document.Chapters.Add(new Chapter()); + } + document.Chapters[^1].Sections.Add(sectionWithChapterTitle.section); + } + } + else + { + throw new EpubDocumentException("failed to get page"); + } + } } else { - ReadPageAsync(url, imageDirectory, ct); + var load = await ReadPageAsync(url, isRensai, imageDirectory, ct).ConfigureAwait(false); + if (load != null) + { + document.Chapters.Add(new Chapter() { Title = null }); + document.Chapters[^1].Sections.Add(load.section); + } } - - return new EpubDocument("", "", "", id); - + return document; } public record BookInfo(int? allcount, int? noveltype, int? general_all_no); - private async Task
ReadPageAsync(string url, string imageDirectory, CancellationToken ct) + private record SectionWithChapterTitle(string? title, Section section); + + private async Task ReadPageAsync(string url,bool isRensai, string imageDirectory, CancellationToken ct) { var config = Configuration.Default.WithDefaultLoader(); using var context = BrowsingContext.New(config); var doc = await context.OpenAsync(url, ct).ConfigureAwait(false); - var chapterTitle = doc.QuerySelector(".chapter_title"); + var chapterTitleElement = doc.QuerySelector(".chapter_title"); + string? chapterTitle = null; + if (chapterTitleElement != null) + { + if (chapterTitleElement.InnerHtml != null) + { + chapterTitle = chapterTitleElement.InnerHtml; + } + } + + IElement? sectionTitleElement = null; + if (isRensai) + { + sectionTitleElement = doc.QuerySelector(".novel_subtitle"); + } + else + { + sectionTitleElement = doc.QuerySelector(".novel_title"); + } - var sectionTitle = doc.QuerySelector(".novel_subtitle"); - if (sectionTitle == null) + string sectionTitle = ""; + if (sectionTitleElement == null) { throw new EpubDocumentException("Can not find title of page"); } + else + { + sectionTitle = sectionTitleElement.InnerHtml; + } - var section = new Section(sectionTitle.InnerHtml); + + var section = new Section(sectionTitleElement.InnerHtml); var main_text = doc.QuerySelector("#novel_honbun"); @@ -147,10 +214,9 @@ private async Task
ReadPageAsync(string url, string imageDirectory, Ca var response = await downloading.Task.ConfigureAwait(false); using var ms = new MemoryStream(); await response.Content.CopyToAsync(ms, ct).ConfigureAwait(false); - var filePass = imageDirectory + FileUrlToFileName().Replace(img.Source, "$1"); - + var filePass = System.IO.Path.Combine(imageDirectory, FileUrlToFileName().Replace(response.Address.Href, "$1")); File.WriteAllBytes(filePass, ms.ToArray()); - + section.Elements.Add(new Picture(filePass)); } } else @@ -164,14 +230,39 @@ private async Task
ReadPageAsync(string url, string imageDirectory, Ca throw new EpubDocumentException("Unexpected structure"); } } - else + else if (item.Children[0].TagName == "RUBY") { - + if (!string.IsNullOrWhiteSpace(item.InnerHtml)) + { + section.Elements.Add(new Paragraph() { Text = item.InnerHtml }); + } + } + else if (item.Children[0] is not IHtmlBreakRowElement) + { + throw new EpubDocumentException("Unexpected structure"); } } else { - throw new EpubDocumentException("Unexpected structure"); + bool isAllRuby = true; + foreach (var tags in item.Children) + { + if (tags.TagName != "RUBY") + { + isAllRuby = false; + } + } + if (isAllRuby) + { + if (!string.IsNullOrWhiteSpace(item.InnerHtml)) + { + section.Elements.Add(new Paragraph() { Text = item.InnerHtml }); + } + } + else + { + throw new EpubDocumentException("Unexpected structure"); + } } } else @@ -184,10 +275,10 @@ private async Task
ReadPageAsync(string url, string imageDirectory, Ca { throw new EpubDocumentException("There is no honbun."); } - return section; + return new SectionWithChapterTitle(chapterTitle, section); } - + [System.Text.RegularExpressions.GeneratedRegex(@"https://.{5,7}.syosetu.com/(.{7}).?")] private static partial System.Text.RegularExpressions.Regex UrlToNcode(); From 82f07d113f1ef2c3e2081b4d0952cbd5043ec049 Mon Sep 17 00:00:00 2001 From: TakenPt Date: Sun, 25 Feb 2024 11:40:13 +0900 Subject: [PATCH 3/5] =?UTF-8?q?=E3=80=8C=E3=80=8D=E3=81=A7=E3=81=AE?= =?UTF-8?q?=E5=8C=BA=E5=88=87=E3=82=8A=E3=82=92=E8=BF=BD=E5=8A=A0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Epub/KoeBook.Epub/ScrapingNarou.cs | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/Epub/KoeBook.Epub/ScrapingNarou.cs b/Epub/KoeBook.Epub/ScrapingNarou.cs index 58a4277..8b0d186 100644 --- a/Epub/KoeBook.Epub/ScrapingNarou.cs +++ b/Epub/KoeBook.Epub/ScrapingNarou.cs @@ -192,7 +192,10 @@ private async Task ReadPageAsync(string url,bool isRens { if (!string.IsNullOrWhiteSpace(item.InnerHtml)) { - section.Elements.Add(new Paragraph() { Text = item.InnerHtml }); + foreach (var split in SplitBrace(item.InnerHtml)) + { + section.Elements.Add(new Paragraph() { Text = split }); + } } } else if (item.ChildElementCount == 1) @@ -234,7 +237,10 @@ private async Task ReadPageAsync(string url,bool isRens { if (!string.IsNullOrWhiteSpace(item.InnerHtml)) { - section.Elements.Add(new Paragraph() { Text = item.InnerHtml }); + foreach (var split in SplitBrace(item.InnerHtml)) + { + section.Elements.Add(new Paragraph() { Text = split }); + } } } else if (item.Children[0] is not IHtmlBreakRowElement) @@ -256,7 +262,10 @@ private async Task ReadPageAsync(string url,bool isRens { if (!string.IsNullOrWhiteSpace(item.InnerHtml)) { - section.Elements.Add(new Paragraph() { Text = item.InnerHtml }); + foreach (var split in SplitBrace(item.InnerHtml)) + { + section.Elements.Add(new Paragraph() { Text = split }); + } } } else From d29c64dd35c9e89cff26490c366840519da7fc8c Mon Sep 17 00:00:00 2001 From: TakenPt Date: Sun, 25 Feb 2024 11:41:34 +0900 Subject: [PATCH 4/5] =?UTF-8?q?=E3=83=95=E3=82=A9=E3=83=BC=E3=83=9E?= =?UTF-8?q?=E3=83=83=E3=83=88?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Epub/KoeBook.Epub/ScrapingNarou.cs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Epub/KoeBook.Epub/ScrapingNarou.cs b/Epub/KoeBook.Epub/ScrapingNarou.cs index 8b0d186..27ae164 100644 --- a/Epub/KoeBook.Epub/ScrapingNarou.cs +++ b/Epub/KoeBook.Epub/ScrapingNarou.cs @@ -88,8 +88,8 @@ public async Task ScrapingAsync(string url, string coverFilePath, { Console.WriteLine(i); await Task.Delay(500); - var pageUrl = System.IO.Path.Combine(url,i.ToString()); - var load = await ReadPageAsync(pageUrl,isRensai,imageDirectory, ct).ConfigureAwait(false); + var pageUrl = System.IO.Path.Combine(url, i.ToString()); + var load = await ReadPageAsync(pageUrl, isRensai, imageDirectory, ct).ConfigureAwait(false); SectionWithChapterTitleList.Add(load); } string? chapterTitle = null; @@ -141,7 +141,7 @@ public record BookInfo(int? allcount, int? noveltype, int? general_all_no); private record SectionWithChapterTitle(string? title, Section section); - private async Task ReadPageAsync(string url,bool isRensai, string imageDirectory, CancellationToken ct) + private async Task ReadPageAsync(string url, bool isRensai, string imageDirectory, CancellationToken ct) { var config = Configuration.Default.WithDefaultLoader(); using var context = BrowsingContext.New(config); @@ -265,7 +265,7 @@ private async Task ReadPageAsync(string url,bool isRens foreach (var split in SplitBrace(item.InnerHtml)) { section.Elements.Add(new Paragraph() { Text = split }); - } + } } } else From d8529dc224277d64f7a906916616856f9af3b6f7 Mon Sep 17 00:00:00 2001 From: TakenPt Date: Sun, 25 Feb 2024 11:44:05 +0900 Subject: [PATCH 5/5] =?UTF-8?q?url=E3=81=A7path.conbine=E3=82=92=E4=BD=BF?= =?UTF-8?q?=E7=94=A8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Epub/KoeBook.Epub/ScrapingAozora.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Epub/KoeBook.Epub/ScrapingAozora.cs b/Epub/KoeBook.Epub/ScrapingAozora.cs index 765afda..ce6406f 100644 --- a/Epub/KoeBook.Epub/ScrapingAozora.cs +++ b/Epub/KoeBook.Epub/ScrapingAozora.cs @@ -286,7 +286,7 @@ public async Task ScrapingAsync(string url, string coverFilePath, var response = await downloading.Task.ConfigureAwait(false); using var ms = new MemoryStream(); await response.Content.CopyToAsync(ms, ct).ConfigureAwait(false); - var filePass = imageDirectory + FileUrlToFileName().Replace(img.Source, "$1"); + var filePass = System.IO.Path.Combine(imageDirectory, FileUrlToFileName().Replace(img.Source, "$1")); File.WriteAllBytes(filePass, ms.ToArray()); checkSection(document, chapterNum); if (document.Chapters[chapterNum].Sections[sectionNum].Elements.Count > 1)