fix ImageExtractorNew

dbpedia · Jan 27, 2022 · ebc6c61 · ebc6c61
1 parent 5b984fc
commit ebc6c61
Showing 1 changed file with 65 additions and 72 deletions.
diff --git a/core/src/main/scala/org/dbpedia/extraction/mappings/ImageExtractorNew.scala b/core/src/main/scala/org/dbpedia/extraction/mappings/ImageExtractorNew.scala
@@ -47,29 +47,19 @@ class ImageExtractorNew(
   private val coatOfArmsProperty = context.ontology.properties("foaf:depiction") //TODO
   private val mainImageProperty = context.ontology.properties("foaf:depiction") //TODO
   private val flagProperty = context.ontology.properties("foaf:depiction") //TODO
-
   private val rdfType = context.ontology.properties("rdf:type")
 
   override val datasets = Set(DBpediaDatasets.Images)
 
-  private var mainImageFound: Boolean = false
-
-  private var flagImage: ArrayBuffer[Option[(String, Node)]] = ArrayBuffer()
-  private var coatOfArmsImage: ArrayBuffer[Option[(String, Node)]] = ArrayBuffer()
-  private var mapImage: ArrayBuffer[Option[(String, Node)]] = ArrayBuffer()
-  private var signatureImage: ArrayBuffer[Option[(String, Node)]] = ArrayBuffer()
-  private var mainImage: Option[(String, Node)] = None
-  private var counter = 0
-  private var imageCount = 0
-
   override def extract(node: PageNode, subjectUri: String): Seq[Quad] = {
+    var mainImageFound: Boolean = false
+    val flagImage: mutable.ArrayBuffer[Option[(String, Node)]] = ArrayBuffer()
+    val coatOfArmsImage: mutable.ArrayBuffer[Option[(String, Node)]] = ArrayBuffer()
+    val mapImage: mutable.ArrayBuffer[Option[(String, Node)]] = ArrayBuffer()
+    val signatureImage: mutable.ArrayBuffer[Option[(String, Node)]] = ArrayBuffer()
+    var mainImage: Option[(String, Node)] = None
 
-    mainImage = None
-    imageCount = 0
-    flagImage = ArrayBuffer[Option[(String, Node)]]()
-    coatOfArmsImage = ArrayBuffer[Option[(String, Node)]]()
-    signatureImage = ArrayBuffer[Option[(String, Node)]]()
-    mapImage = ArrayBuffer[Option[(String, Node)]]()
+    var isFirstImage = true
 
     if (node.title.namespace != Namespace.Main) return Seq.empty
 
@@ -90,7 +80,10 @@ class ImageExtractorNew(
           val thumbnailUrl = ExtractorUtils.getThumbnailURL(imageFileName, lang)
 
           quads += new Quad(language, DBpediaDatasets.Images, subjectUri, foafDepictionProperty, url, sourceNode.sourceIri)
-          quads += new Quad(language, DBpediaDatasets.Images, subjectUri, dbpediaThumbnailProperty, thumbnailUrl, sourceNode.sourceIri)
+          if (isFirstImage) {
+            quads += new Quad(language, DBpediaDatasets.Images, subjectUri, dbpediaThumbnailProperty, thumbnailUrl, sourceNode.sourceIri)
+            isFirstImage = false
+          }
           quads += new Quad(language, DBpediaDatasets.Images, url, foafThumbnailProperty, thumbnailUrl, sourceNode.sourceIri)
           quads += new Quad(language, DBpediaDatasets.Images, url, rdfType, imageClass.uri, sourceNode.sourceIri)
           quads += new Quad(language, DBpediaDatasets.Images, thumbnailUrl, rdfType, imageClass.uri, sourceNode.sourceIri)
@@ -134,6 +127,58 @@ class ImageExtractorNew(
       quads += new Quad(language, DBpediaDatasets.Images, subjectUri, mapProperty, url, img._2.sourceIri,null)
     }))
 
+    def imageSearch(nodes: List[Node], depth: Int): Seq[Option[(String, Node)]] = {
+
+      var images = ArrayBuffer[Option[(String, Node)]]()
+
+      // Match every node for TextNode, InternalLinkNode & InterWikiLinkNode
+      // for other types of node => recursive search for these types in their children
+
+      nodes.foreach {
+        case node@TextNode(_, _, _) =>
+          // toWikiText is used instead of toPlainText because some images would get lost.
+          totallyNotImageLinkRegex(node.toWikiText).foreach(file => {
+            images += processImageLink(file, node)
+          })
+        //times += "ImageLinkRegex Finish: " -> System.currentTimeMillis()
+        case node@InternalLinkNode(_, _, _, _) =>
+          totallyNotImageLinkRegex(node.toWikiText).foreach(file => {
+            images += processImageLink(file, node)
+          })
+        case node@InterWikiLinkNode(_, _, _, _) =>
+          totallyNotImageLinkRegex(node.toWikiText).foreach(file => {
+            images += processImageLink(file, node)
+          })
+        case node =>
+          if (depth < recursionDepth) images ++= imageSearch(node.children, depth + 1)
+      }
+      images
+    }
+    def processImageLink(fileName: String, node: Node): Option[(String, Node)] = {
+      // Encoding
+      var encodedFileName = fileName
+      if (encodedLinkRegex.findFirstIn(fileName).isEmpty)
+        encodedFileName = UriUtils.iriDecode(WikiUtil.wikiEncode(fileName))
+
+      val result = if (!context.nonFreeImages.contains(fileName))
+        Some((encodedFileName, node))
+      else
+        None
+
+      // --------------- Special Images ---------------
+      val special = totallyNotSpecialImageRegex(encodedFileName)
+      if (special == "flag") flagImage += result
+      else if (special == "map") mapImage += result
+      else if (special == "coa") coatOfArmsImage += result
+      else if (special == "signature") signatureImage += result
+      if (!mainImageFound) {
+        // First Image will be defined as main Image
+        mainImage = result
+        mainImageFound = true
+      }
+      result
+    }
+
     quads
   }
 
@@ -142,36 +187,7 @@ class ImageExtractorNew(
     * @param nodes List of nodes that will be searched
     * @return List of filename and origin node
     */
-  private def imageSearch(nodes: List[Node], depth: Int): Seq[Option[(String, Node)]] = {
-
-    var images = ArrayBuffer[Option[(String, Node)]]()
-
-    // Match every node for TextNode, InternalLinkNode & InterWikiLinkNode
-    // for other types of node => recursive search for these types in their children
-
-    nodes.foreach {
-      case node@TextNode(_, _, _) =>
-        // toWikiText is used instead of toPlainText because some images would get lost.
-        totallyNotImageLinkRegex(node.toWikiText).foreach(file => {
-          images += processImageLink(file, node)
-          imageCount += 1
-        })
-      //times += "ImageLinkRegex Finish: " -> System.currentTimeMillis()
-      case node@InternalLinkNode(_, _, _, _) =>
-        totallyNotImageLinkRegex(node.toWikiText).foreach(file => {
-          images += processImageLink(file, node)
-          imageCount += 1
-        })
-      case node@InterWikiLinkNode(_, _, _, _) =>
-        totallyNotImageLinkRegex(node.toWikiText).foreach(file => {
-          images += processImageLink(file, node)
-          imageCount += 1
-        })
-      case node =>
-        if (depth < recursionDepth) images ++= imageSearch(node.children, depth + 1)
-    }
-    images
-  }
+
 
   /**
     * Ensures Encoding and Copyright
@@ -180,30 +196,7 @@ class ImageExtractorNew(
     * @param node     Source-Node of the fileName
     * @return Encoded fileName of an Non-Non-Free Image
     */
-  private def processImageLink(fileName: String, node: Node): Option[(String, Node)] = {
-    // Encoding
-    var encodedFileName = fileName
-    if (encodedLinkRegex.findFirstIn(fileName).isEmpty)
-      encodedFileName = UriUtils.iriDecode(WikiUtil.wikiEncode(fileName))
-
-    val result = if (!context.nonFreeImages.contains(fileName))
-      Some((encodedFileName, node))
-    else
-      None
-
-    // --------------- Special Images ---------------
-    val special = totallyNotSpecialImageRegex(encodedFileName)
-    if (special == "flag") flagImage += result
-    else if (special == "map") mapImage += result
-    else if (special == "coa") coatOfArmsImage += result
-    else if (special == "signature") signatureImage += result
-    if (!mainImageFound) {
-      // First Image will be defined as main Image
-      mainImage = result
-      mainImageFound = true
-    }
-    result
-  }
+
 
   /**
     * We needed a faster way to find images in text, than regex