From dc24b4e5c4a2d4e2dfb95237ad888d94c6f47def Mon Sep 17 00:00:00 2001 From: Leonardo Cardoso Date: Thu, 23 Sep 2021 20:28:49 +0200 Subject: [PATCH] Capture baseURL --- CHANGELOG.md | 1 + README.md | 21 +++---- Sources/Regex.swift | 5 +- Sources/Response.swift | 3 +- Sources/ResponseExtension.swift | 6 ++ Sources/SwiftLinkPreview.swift | 54 +++++++++++++++--- SwiftLinkPreview.xcodeproj/project.pbxproj | 44 ++++++++++----- SwiftLinkPreviewTests/BaseURLTests.swift | 55 +++++++++++++++++++ SwiftLinkPreviewTests/Constants.swift | 1 + SwiftLinkPreviewTests/head-meta-base.html | 10 ++++ SwiftLinkPreviewTests/head-meta-facebook.html | 2 +- SwiftLinkPreviewTests/head-meta-meta.html | 2 +- 12 files changed, 166 insertions(+), 38 deletions(-) create mode 100644 SwiftLinkPreviewTests/BaseURLTests.swift create mode 100644 SwiftLinkPreviewTests/head-meta-base.html diff --git a/CHANGELOG.md b/CHANGELOG.md index f0699d7..807d6a4 100755 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -31,6 +31,7 @@ - Updated regex limit [#148](https://github.com/LeonardoCardoso/SwiftLinkPreview/issues/148) - Changed by [kinhvodoi92](https://github.com/kinhvodoi92) - Annotated `Cancelable.cancel()` as `@objc` to make it compatibale with Objective-C [#135](https://github.com/LeonardoCardoso/SwiftLinkPreview/issues/135) +- Capture base URL [#45](https://github.com/LeonardoCardoso/SwiftLinkPreview/issues/45) - Changed by [LeonardoCardoso](https://github.com/LeonardoCardoso) diff --git a/README.md b/README.md index e5f1435..c2e23ca 100644 --- a/README.md +++ b/README.md @@ -121,16 +121,17 @@ let preview = slp.preview("Text containing URL", ```swift Response { - let url: URL // URL - let finalUrl: URL // unshortened URL - let canonicalUrl: String // canonical URL - let title: String // title - let description: String // page description or relevant text - let images: [String] // array of URLs of the images - let image: String // main image - let icon: String // favicon - let video: String // video - let price: 
String // price + let baseURL: String? // base + let url: URL? // URL + let finalUrl: URL? // unshortened URL + let canonicalUrl: String? // canonical URL + let title: String? // title + let description: String? // page description or relevant text + let images: [String]? // array of URLs of the images + let image: String? // main image + let icon: String? // favicon + let video: String? // video + let price: String? // price } ``` diff --git a/Sources/Regex.swift b/Sources/Regex.swift index 843db2b..06c1738 100644 --- a/Sources/Regex.swift +++ b/Sources/Regex.swift @@ -17,8 +17,9 @@ class Regex { static let imageTagPattern = "" static let secondaryImageTagPattern = "og:image\"(.+?)content=\"([^\"](.+?))\"(.+?)[/]?>" static let titlePattern = "(.*?)" - static let metatagPattern = "" - static let metatagContentPattern = "content=(\"(.*?)\")|('(.*?)')" + static let metaTagPattern = "" + static let baseTagPattern = "" + static let metaTagContentPattern = "content=(\"(.*?)\")|('(.*?)')" static let cannonicalUrlPattern = "([^\\+&#@%\\?=~_\\|!:,;]+)" static let rawTagPattern = "<[^>]+>" static let inlineStylePattern = "(.*?)" diff --git a/Sources/Response.swift b/Sources/Response.swift index d20f657..c3ef506 100644 --- a/Sources/Response.swift +++ b/Sources/Response.swift @@ -9,7 +9,8 @@ import Foundation public struct Response { - + + public internal(set) var baseURL: String? public internal(set) var url: URL? public internal(set) var finalUrl: URL? public internal(set) var canonicalUrl: String? 
diff --git a/Sources/ResponseExtension.swift b/Sources/ResponseExtension.swift index d3dd9b3..169748b 100644 --- a/Sources/ResponseExtension.swift +++ b/Sources/ResponseExtension.swift @@ -12,6 +12,7 @@ internal extension Response { var dictionary: [String: Any] { var responseData:[String: Any] = [:] + responseData["baseURL"] = baseURL responseData["url"] = url responseData["finalUrl"] = finalUrl responseData["canonicalUrl"] = canonicalUrl @@ -35,11 +36,14 @@ internal extension Response { case images case icon case video + case baseURL case price } mutating func set(_ value: Any, for key: Key) { switch key { + case Key.baseURL: + if let value = value as? String { self.baseURL = value } case Key.url: if let value = value as? URL { self.url = value } case Key.finalUrl: @@ -65,6 +69,8 @@ internal extension Response { func value(for key: Key) -> Any? { switch key { + case Key.baseURL: + return self.baseURL case Key.url: return self.url case Key.finalUrl: diff --git a/Sources/SwiftLinkPreview.swift b/Sources/SwiftLinkPreview.swift index 86f2f36..1ac52cd 100644 --- a/Sources/SwiftLinkPreview.swift +++ b/Sources/SwiftLinkPreview.swift @@ -132,10 +132,11 @@ open class SwiftLinkPreview: NSObject { result.title = $0.title result.description = $0.description - result.image = $0.image - result.images = $0.images - result.icon = $0.icon - result.video = $0.video + + result.image = self.formatImageURL($0.image, base: $0.baseURL) + result.images = self.formatImageURLs($0.images, base: $0.baseURL) + result.icon = self.formatImageURL($0.icon, base: $0.baseURL) + result.video = self.formatImageURL($0.video, base: $0.baseURL) result.price = $0.price self.cache.slp_setCachedResponse(url: unshortened.absoluteString, response: result) @@ -154,6 +155,28 @@ open class SwiftLinkPreview: NSObject { return cancellable } + private func formatImageURL(_ url: String?, base: String?) -> String? 
{ + guard var url = url else { return nil } + + if !url.starts(with: "http"), let base = base { + url = "\(base)\(url)" + } + + return url + } + + func formatImageURLs(_ array: [String]?, base: String?) -> [String]? { + guard var array = array else { return nil } + + for i in 0 ..< array.count { + if let formatted = formatImageURL(array[i], base: base) { + array[i] = formatted + } + } + + return Array(Set(array)) + } + /* Extract url redirection inside the GET query. Like https://www.dji.com/404?url=http%3A%2F%2Fwww.dji.com%2Fmatrice600-pro%2Finfo#specs -> http://www.dji.com/de/matrice600-pro/info#specs @@ -287,9 +310,9 @@ extension SwiftLinkPreview { CFStringConvertIANACharSetNameToEncoding( $0 as CFString ) ) ) } ?? .utf8 if let html = String( data: data, encoding: encoding ) { - for meta in Regex.pregMatchAll( html, regex: Regex.metatagPattern, index: 1 ) { + for meta in Regex.pregMatchAll( html, regex: Regex.metaTagPattern, index: 1 ) { if (meta.contains( "http-equiv=\"refresh\"" ) || meta.contains( "http-equiv='refresh'" )), - let value = Regex.pregMatchFirst( meta, regex: Regex.metatagContentPattern, index: 2 )?.decoded.extendedTrim, + let value = Regex.pregMatchFirst( meta, regex: Regex.metaTagContentPattern, index: 2 )?.decoded.extendedTrim, let redirectString = value.split( separator: ";" ) .first( where: { $0.lowercased().starts( with: "url=" ) } )?
.split( separator: "=", maxSplits: 1 ).last, @@ -444,6 +467,8 @@ extension SwiftLinkPreview { result = self.crawlMetaTags(sanitizedHtmlCode, result: result) + result = self.crawlMetaBase(sanitizedHtmlCode, result: result) + var otherResponse = self.crawlTitle(sanitizedHtmlCode, result: result) otherResponse = self.crawlDescription(otherResponse.htmlCode, result: otherResponse.result) @@ -534,10 +559,10 @@ extension SwiftLinkPreview { Response.Key.title.rawValue, Response.Key.description.rawValue, Response.Key.image.rawValue, - Response.Key.video.rawValue, + Response.Key.video.rawValue ] - let metatags = Regex.pregMatchAll(htmlCode, regex: Regex.metatagPattern, index: 1) + let metatags = Regex.pregMatchAll(htmlCode, regex: Regex.metaTagPattern, index: 1) for metatag in metatags { for tag in possibleTags { @@ -552,7 +577,7 @@ extension SwiftLinkPreview { if let key = Response.Key(rawValue: tag), result.value(for: key) == nil { - if let value = Regex.pregMatchFirst(metatag, regex: Regex.metatagContentPattern, index: 2) { + if let value = Regex.pregMatchFirst(metatag, regex: Regex.metaTagContentPattern, index: 2) { let value = value.decoded.extendedTrim if tag == "image" { let value = addImagePrefixIfNeeded(value, result: result) @@ -572,6 +597,17 @@ extension SwiftLinkPreview { return result } + internal func crawlMetaBase(_ htmlCode: String, result: Response) -> Response { + + var result = result + + if let base = Regex.pregMatchAll(htmlCode, regex: Regex.baseTagPattern, index: 2).first { + result.set(base, for: .baseURL) + } + + return result + } + // Crawl for title if needed internal func crawlTitle(_ htmlCode: String, result: Response) -> (htmlCode: String, result: Response) { var result = result diff --git a/SwiftLinkPreview.xcodeproj/project.pbxproj b/SwiftLinkPreview.xcodeproj/project.pbxproj index 403e874..3ee57ac 100644 --- a/SwiftLinkPreview.xcodeproj/project.pbxproj +++ b/SwiftLinkPreview.xcodeproj/project.pbxproj @@ -8,6 +8,12 @@ /* Begin PBXBuildFile 
section */ 1F8164ED26287866000F2905 /* VideoTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = 1F8164EC26287866000F2905 /* VideoTests.swift */; }; + 27BCC85826FCF22E00886BDA /* BaseURLTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = 27BCC85726FCF22E00886BDA /* BaseURLTests.swift */; }; + 27BCC85D26FCF3BF00886BDA /* BaseURLTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = 27BCC85726FCF22E00886BDA /* BaseURLTests.swift */; }; + 27BCC85E26FCF3C000886BDA /* BaseURLTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = 27BCC85726FCF22E00886BDA /* BaseURLTests.swift */; }; + 27BCC86026FCF4C000886BDA /* head-meta-base.html in Resources */ = {isa = PBXBuildFile; fileRef = 27BCC85F26FCF4C000886BDA /* head-meta-base.html */; }; + 27BCC86126FCF4C000886BDA /* head-meta-base.html in Resources */ = {isa = PBXBuildFile; fileRef = 27BCC85F26FCF4C000886BDA /* head-meta-base.html */; }; + 27BCC86226FCF4C000886BDA /* head-meta-base.html in Resources */ = {isa = PBXBuildFile; fileRef = 27BCC85F26FCF4C000886BDA /* head-meta-base.html */; }; 68074FFA1F23B6C900649DE6 /* head-meta-icon.html in Resources */ = {isa = PBXBuildFile; fileRef = 68074FF91F23B6C900649DE6 /* head-meta-icon.html */; }; 68074FFB1F23BB1100649DE6 /* head-meta-icon.html in Resources */ = {isa = PBXBuildFile; fileRef = 68074FF91F23B6C900649DE6 /* head-meta-icon.html */; }; 68074FFC1F23BB1400649DE6 /* head-meta-icon.html in Resources */ = {isa = PBXBuildFile; fileRef = 68074FF91F23B6C900649DE6 /* head-meta-icon.html */; }; @@ -150,6 +156,8 @@ /* Begin PBXFileReference section */ 1F8164EC26287866000F2905 /* VideoTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = VideoTests.swift; sourceTree = ""; }; + 27BCC85726FCF22E00886BDA /* BaseURLTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = BaseURLTests.swift; sourceTree = ""; }; + 27BCC85F26FCF4C000886BDA /* head-meta-base.html */ = {isa = PBXFileReference; lastKnownFileType 
= text.html; path = "head-meta-base.html"; sourceTree = ""; }; 68074FF91F23B6C900649DE6 /* head-meta-icon.html */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.html; path = "head-meta-icon.html"; sourceTree = ""; }; 686E58DE1F22416D000C2A33 /* IconTests.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = IconTests.swift; sourceTree = ""; }; 7A552DE121A460910019E8B1 /* Response.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = Response.swift; sourceTree = ""; }; @@ -257,17 +265,18 @@ 985DCEB01D2BFD2700B40D76 /* Files */ = { isa = PBXGroup; children = ( - 98B5ED421D3E5F5C00AEBD54 /* head-meta-itemprop.html */, - 986D5BE61D33E0FD0025555F /* head-title.html */, - 985DCEBB1D2BFFAF00B40D76 /* body-text-span.html */, - 985DCEBC1D2BFFAF00B40D76 /* body-text-p.html */, - 985DCEBD1D2BFFAF00B40D76 /* body-text-div.html */, - 985DCEB71D2BFE4100B40D76 /* body-image-single.html */, 985DCEB81D2BFE4100B40D76 /* body-image-gallery.html */, - 985DCEB11D2BFD3400B40D76 /* head-meta-twitter.html */, - 985DCEB21D2BFD3400B40D76 /* head-meta-meta.html */, + 985DCEB71D2BFE4100B40D76 /* body-image-single.html */, + 985DCEBD1D2BFFAF00B40D76 /* body-text-div.html */, + 985DCEBC1D2BFFAF00B40D76 /* body-text-p.html */, + 985DCEBB1D2BFFAF00B40D76 /* body-text-span.html */, + 27BCC85F26FCF4C000886BDA /* head-meta-base.html */, 985DCEB31D2BFD3400B40D76 /* head-meta-facebook.html */, 68074FF91F23B6C900649DE6 /* head-meta-icon.html */, + 98B5ED421D3E5F5C00AEBD54 /* head-meta-itemprop.html */, + 985DCEB21D2BFD3400B40D76 /* head-meta-meta.html */, + 985DCEB11D2BFD3400B40D76 /* head-meta-twitter.html */, + 986D5BE61D33E0FD0025555F /* head-title.html */, ); name = Files; sourceTree = ""; @@ -358,18 +367,19 @@ 98DC53391D1D73DB001134E3 /* SwiftLinkPreviewTests */ = { isa = PBXGroup; children = ( - 98B5ED491D3E7DC600AEBD54 /* HugeTests.swift */, + 27BCC85726FCF22E00886BDA /* BaseURLTests.swift */, 
985DCEC81D2C029700B40D76 /* BodyTests.swift */, + 988B48D61D2C3C2E0040A4AD /* Constants */, + 985DCEB01D2BFD2700B40D76 /* Files */, + 98B5ED491D3E7DC600AEBD54 /* HugeTests.swift */, + 686E58DE1F22416D000C2A33 /* IconTests.swift */, 985DCEC61D2C026000B40D76 /* ImageTests.swift */, - 1F8164EC26287866000F2905 /* VideoTests.swift */, + 98E7C3121D3B23F5009E5F6D /* Info */, 985DCEC41D2C022E00B40D76 /* MetaTests.swift */, 982812911D3A9293000D3ABB /* RegexTests.swift */, 986D5BE41D33DFE50025555F /* TitleTests.swift */, - 686E58DE1F22416D000C2A33 /* IconTests.swift */, - 988B48D61D2C3C2E0040A4AD /* Constants */, - 985DCEB01D2BFD2700B40D76 /* Files */, - 98E7C3121D3B23F5009E5F6D /* Info */, 988B48D11D2C39790040A4AD /* Utils */, + 1F8164EC26287866000F2905 /* VideoTests.swift */, ); path = SwiftLinkPreviewTests; sourceTree = ""; @@ -641,6 +651,7 @@ 985DCEBF1D2BFFAF00B40D76 /* body-text-span.html in Resources */, 985DCEC01D2BFFAF00B40D76 /* body-text-p.html in Resources */, 985DCEC11D2BFFAF00B40D76 /* body-text-div.html in Resources */, + 27BCC86026FCF4C000886BDA /* head-meta-base.html in Resources */, 985DCEB91D2BFE4100B40D76 /* body-image-single.html in Resources */, 985DCEB61D2BFD3400B40D76 /* head-meta-facebook.html in Resources */, 986D5BE71D33E0FD0025555F /* head-title.html in Resources */, @@ -666,6 +677,7 @@ 98E7C32F1D3B24DA009E5F6D /* body-image-single.html in Resources */, 98B5ED461D3E62A200AEBD54 /* head-meta-itemprop.html in Resources */, 98E7C3301D3B24DA009E5F6D /* body-image-gallery.html in Resources */, + 27BCC86226FCF4C000886BDA /* head-meta-base.html in Resources */, 98E7C3311D3B24DA009E5F6D /* head-meta-twitter.html in Resources */, 98E7C3321D3B24DA009E5F6D /* head-meta-meta.html in Resources */, 98E7C3331D3B24DA009E5F6D /* head-meta-facebook.html in Resources */, @@ -691,6 +703,7 @@ 98F76D1D1D3AF87100E9B10E /* body-image-single.html in Resources */, 98B5ED441D3E62A000AEBD54 /* head-meta-itemprop.html in Resources */, 98F76D1E1D3AF87100E9B10E /* 
body-image-gallery.html in Resources */, + 27BCC86126FCF4C000886BDA /* head-meta-base.html in Resources */, 98F76D1F1D3AF87100E9B10E /* head-meta-twitter.html in Resources */, 98F76D201D3AF87100E9B10E /* head-meta-meta.html in Resources */, 98F76D211D3AF87100E9B10E /* head-meta-facebook.html in Resources */, @@ -740,6 +753,7 @@ 986D5BE51D33DFE60025555F /* TitleTests.swift in Sources */, 985DCEC71D2C026000B40D76 /* ImageTests.swift in Sources */, 988B48D81D2C3C3D0040A4AD /* Constants.swift in Sources */, + 27BCC85826FCF22E00886BDA /* BaseURLTests.swift in Sources */, 9272A10D1E2EF0E600F9F17E /* Regex.swift in Sources */, ); runOnlyForDeploymentPostprocessing = 0; @@ -777,6 +791,7 @@ 98E7C3281D3B24C6009E5F6D /* File.swift in Sources */, 98E7C3291D3B24C6009E5F6D /* IntExtension.swift in Sources */, 9272A10F1E2EF0E800F9F17E /* Regex.swift in Sources */, + 27BCC85E26FCF3C000886BDA /* BaseURLTests.swift in Sources */, ); runOnlyForDeploymentPostprocessing = 0; }; @@ -813,6 +828,7 @@ 98F76D121D3AF78600E9B10E /* File.swift in Sources */, 98F76D131D3AF78600E9B10E /* IntExtension.swift in Sources */, 9272A10E1E2EF0E700F9F17E /* Regex.swift in Sources */, + 27BCC85D26FCF3BF00886BDA /* BaseURLTests.swift in Sources */, ); runOnlyForDeploymentPostprocessing = 0; }; diff --git a/SwiftLinkPreviewTests/BaseURLTests.swift b/SwiftLinkPreviewTests/BaseURLTests.swift new file mode 100644 index 0000000..33ee379 --- /dev/null +++ b/SwiftLinkPreviewTests/BaseURLTests.swift @@ -0,0 +1,55 @@ +// +// BaseURLTests.swift +// SwiftLinkPreviewTests +// +// Created by Leonardo Cardoso on 23.09.21. +// Copyright © 2021 leocardz.com. All rights reserved. 
+// + +import XCTest +@testable import SwiftLinkPreview + +// This class tests head meta info +class BaseURLTests: XCTestCase { + + // MARK: - Vars + var baseTemplate = "" + let slp = SwiftLinkPreview() + + // MARK: - SetUps + // Those setup functions get that template, and fulfil determinated areas with rand texts, images and tags + override func setUp() { + super.setUp() + + self.baseTemplate = File.toString(Constants.headMetaBase) + + } + + // MARK: - Base + func setUpBaseAndRun() { + + var baseTemplate = self.baseTemplate + baseTemplate = baseTemplate.replace(Constants.headRandom, with: String.randomTag()) + baseTemplate = baseTemplate.replace(Constants.bodyRandom, with: String.randomTag()).extendedTrim + + let result = self.slp.crawlMetaBase(baseTemplate, result: Response()) + + XCTAssertEqual(result.baseURL, "https://host/resource/index/") + } + + func testBase() { + + for _ in 0 ..< 100 { + + self.setUpBaseAndRun() + + } + + } + + func testResultBase() { + XCTAssertEqual(slp.formatImageURLs(["assets/test.png"], base: "https://host/resource/index/")?.first, + "https://host/resource/index/assets/test.png") + } + +} diff --git a/SwiftLinkPreviewTests/Constants.swift b/SwiftLinkPreviewTests/Constants.swift index db15838..e04a9fb 100644 --- a/SwiftLinkPreviewTests/Constants.swift +++ b/SwiftLinkPreviewTests/Constants.swift @@ -20,6 +20,7 @@ struct Constants { static let bodyIcon = "head-meta-icon" static let headMetaTwitter = "head-meta-twitter" static let headMetaMeta = "head-meta-meta" + static let headMetaBase = "head-meta-base" static let headMetaItemprop = "head-meta-itemprop" static let headMetaFacebook = "head-meta-facebook" static let headTitle = "head-title" diff --git a/SwiftLinkPreviewTests/head-meta-base.html b/SwiftLinkPreviewTests/head-meta-base.html new file mode 100644 index 0000000..7f66076 --- /dev/null +++ b/SwiftLinkPreviewTests/head-meta-base.html @@ -0,0 +1,10 @@ + + + [:head-random] + + + + [:body-random] + + + diff --git 
a/SwiftLinkPreviewTests/head-meta-facebook.html b/SwiftLinkPreviewTests/head-meta-facebook.html index 7d1b9d9..79e01b1 100644 --- a/SwiftLinkPreviewTests/head-meta-facebook.html +++ b/SwiftLinkPreviewTests/head-meta-facebook.html @@ -10,4 +10,4 @@ [:body-random] - \ No newline at end of file + diff --git a/SwiftLinkPreviewTests/head-meta-meta.html b/SwiftLinkPreviewTests/head-meta-meta.html index a6575a3..cb09599 100644 --- a/SwiftLinkPreviewTests/head-meta-meta.html +++ b/SwiftLinkPreviewTests/head-meta-meta.html @@ -10,4 +10,4 @@ [:body-random] - \ No newline at end of file +