Skip to content

Commit

Permalink
fix ImageExtractorNew
Browse files Browse the repository at this point in the history
  • Loading branch information
jlareck committed Jan 27, 2022
1 parent 5b984fc commit ebc6c61
Showing 1 changed file with 65 additions and 72 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -47,29 +47,19 @@ class ImageExtractorNew(
private val coatOfArmsProperty = context.ontology.properties("foaf:depiction") //TODO
private val mainImageProperty = context.ontology.properties("foaf:depiction") //TODO
private val flagProperty = context.ontology.properties("foaf:depiction") //TODO

private val rdfType = context.ontology.properties("rdf:type")

override val datasets = Set(DBpediaDatasets.Images)

private var mainImageFound: Boolean = false

private var flagImage: ArrayBuffer[Option[(String, Node)]] = ArrayBuffer()
private var coatOfArmsImage: ArrayBuffer[Option[(String, Node)]] = ArrayBuffer()
private var mapImage: ArrayBuffer[Option[(String, Node)]] = ArrayBuffer()
private var signatureImage: ArrayBuffer[Option[(String, Node)]] = ArrayBuffer()
private var mainImage: Option[(String, Node)] = None
private var counter = 0
private var imageCount = 0

override def extract(node: PageNode, subjectUri: String): Seq[Quad] = {
var mainImageFound: Boolean = false
val flagImage: mutable.ArrayBuffer[Option[(String, Node)]] = ArrayBuffer()
val coatOfArmsImage: mutable.ArrayBuffer[Option[(String, Node)]] = ArrayBuffer()
val mapImage: mutable.ArrayBuffer[Option[(String, Node)]] = ArrayBuffer()
val signatureImage: mutable.ArrayBuffer[Option[(String, Node)]] = ArrayBuffer()
var mainImage: Option[(String, Node)] = None

mainImage = None
imageCount = 0
flagImage = ArrayBuffer[Option[(String, Node)]]()
coatOfArmsImage = ArrayBuffer[Option[(String, Node)]]()
signatureImage = ArrayBuffer[Option[(String, Node)]]()
mapImage = ArrayBuffer[Option[(String, Node)]]()
var isFirstImage = true

if (node.title.namespace != Namespace.Main) return Seq.empty

Expand All @@ -90,7 +80,10 @@ class ImageExtractorNew(
val thumbnailUrl = ExtractorUtils.getThumbnailURL(imageFileName, lang)

quads += new Quad(language, DBpediaDatasets.Images, subjectUri, foafDepictionProperty, url, sourceNode.sourceIri)
quads += new Quad(language, DBpediaDatasets.Images, subjectUri, dbpediaThumbnailProperty, thumbnailUrl, sourceNode.sourceIri)
if (isFirstImage) {
quads += new Quad(language, DBpediaDatasets.Images, subjectUri, dbpediaThumbnailProperty, thumbnailUrl, sourceNode.sourceIri)
isFirstImage = false
}
quads += new Quad(language, DBpediaDatasets.Images, url, foafThumbnailProperty, thumbnailUrl, sourceNode.sourceIri)
quads += new Quad(language, DBpediaDatasets.Images, url, rdfType, imageClass.uri, sourceNode.sourceIri)
quads += new Quad(language, DBpediaDatasets.Images, thumbnailUrl, rdfType, imageClass.uri, sourceNode.sourceIri)
Expand Down Expand Up @@ -134,6 +127,58 @@ class ImageExtractorNew(
quads += new Quad(language, DBpediaDatasets.Images, subjectUri, mapProperty, url, img._2.sourceIri,null)
}))

def imageSearch(nodes: List[Node], depth: Int): Seq[Option[(String, Node)]] = {

var images = ArrayBuffer[Option[(String, Node)]]()

// Match every node for TextNode, InternalLinkNode & InterWikiLinkNode
// for other types of node => recursive search for these types in their children

nodes.foreach {
case node@TextNode(_, _, _) =>
// toWikiText is used instead of toPlainText because some images would get lost.
totallyNotImageLinkRegex(node.toWikiText).foreach(file => {
images += processImageLink(file, node)
})
//times += "ImageLinkRegex Finish: " -> System.currentTimeMillis()
case node@InternalLinkNode(_, _, _, _) =>
totallyNotImageLinkRegex(node.toWikiText).foreach(file => {
images += processImageLink(file, node)
})
case node@InterWikiLinkNode(_, _, _, _) =>
totallyNotImageLinkRegex(node.toWikiText).foreach(file => {
images += processImageLink(file, node)
})
case node =>
if (depth < recursionDepth) images ++= imageSearch(node.children, depth + 1)
}
images
}
def processImageLink(fileName: String, node: Node): Option[(String, Node)] = {
// Encoding
var encodedFileName = fileName
if (encodedLinkRegex.findFirstIn(fileName).isEmpty)
encodedFileName = UriUtils.iriDecode(WikiUtil.wikiEncode(fileName))

val result = if (!context.nonFreeImages.contains(fileName))
Some((encodedFileName, node))
else
None

// --------------- Special Images ---------------
val special = totallyNotSpecialImageRegex(encodedFileName)
if (special == "flag") flagImage += result
else if (special == "map") mapImage += result
else if (special == "coa") coatOfArmsImage += result
else if (special == "signature") signatureImage += result
if (!mainImageFound) {
// First Image will be defined as main Image
mainImage = result
mainImageFound = true
}
result
}

quads
}

Expand All @@ -142,36 +187,7 @@ class ImageExtractorNew(
* @param nodes List of nodes that will be searched
* @return List of filename and origin node
*/
private def imageSearch(nodes: List[Node], depth: Int): Seq[Option[(String, Node)]] = {

var images = ArrayBuffer[Option[(String, Node)]]()

// Match every node for TextNode, InternalLinkNode & InterWikiLinkNode
// for other types of node => recursive search for these types in their children

nodes.foreach {
case node@TextNode(_, _, _) =>
// toWikiText is used instead of toPlainText because some images would get lost.
totallyNotImageLinkRegex(node.toWikiText).foreach(file => {
images += processImageLink(file, node)
imageCount += 1
})
//times += "ImageLinkRegex Finish: " -> System.currentTimeMillis()
case node@InternalLinkNode(_, _, _, _) =>
totallyNotImageLinkRegex(node.toWikiText).foreach(file => {
images += processImageLink(file, node)
imageCount += 1
})
case node@InterWikiLinkNode(_, _, _, _) =>
totallyNotImageLinkRegex(node.toWikiText).foreach(file => {
images += processImageLink(file, node)
imageCount += 1
})
case node =>
if (depth < recursionDepth) images ++= imageSearch(node.children, depth + 1)
}
images
}


/**
* Ensures Encoding and Copyright
Expand All @@ -180,30 +196,7 @@ class ImageExtractorNew(
* @param node Source-Node of the fileName
* @return Encoded fileName of an Non-Non-Free Image
*/
private def processImageLink(fileName: String, node: Node): Option[(String, Node)] = {
// Encoding
var encodedFileName = fileName
if (encodedLinkRegex.findFirstIn(fileName).isEmpty)
encodedFileName = UriUtils.iriDecode(WikiUtil.wikiEncode(fileName))

val result = if (!context.nonFreeImages.contains(fileName))
Some((encodedFileName, node))
else
None

// --------------- Special Images ---------------
val special = totallyNotSpecialImageRegex(encodedFileName)
if (special == "flag") flagImage += result
else if (special == "map") mapImage += result
else if (special == "coa") coatOfArmsImage += result
else if (special == "signature") signatureImage += result
if (!mainImageFound) {
// First Image will be defined as main Image
mainImage = result
mainImageFound = true
}
result
}


/**
* We needed a faster way to find images in text, than regex
Expand Down

0 comments on commit ebc6c61

Please sign in to comment.