Fix gephi export limit #464

Merged 3 commits on Dec 17, 2024
3 changes: 3 additions & 0 deletions CHANGES.md
@@ -4,8 +4,11 @@
UNRELEASED
-----
Upgraded solr dependencies from v9.1.0 to v9.4.1
HTML pages with a geo tag will no longer be found in image GEO search.
Fixed Gephi export regression bug: not all results were extracted because the Gephi export was also capped by the CSV export limit size in the property file.
Added SolrWayback ASCII logo to the log file when started successfully.
Added support for the Memento API, including timegates and timemaps. Memento properties added to solrwayback.properties. (Thanks @VictorHarbo)


5.1.2
-----
Bug fix: chunking was not removed in all cases. This was only relevant for WARC files created with chunking (not Heritrix).
@@ -566,9 +566,25 @@ public static InputStream exportWarcStreaming(boolean expandResources, boolean e
return new StreamingSolrWarcExportBufferedInputStream(solrDocs, max, gzip); // Use maximum export results from property-file
}

/**
 * <p>
 * The query will have the filter: content_type_norm:html AND links_domains:* AND url_type:slashpage
 * </p>
 * <p>
 * The same domains will appear many times in the Solr result set, but only the first occurrence will be
 * added to the CSV file. The extraction uses a HashMap to remember which domains have already been added.
 * </p>
 * <p>
 * The 100M limit on Solr documents will most likely result in a final CSV file with fewer than 1M documents,
 * which should be enough since Gephi cannot handle more than 1M nodes.
 * Split the extraction by crawl_year if the 100M limit is not enough.
 * </p>
 *
 * @param q the query
 */
public static InputStream exportLinkGraphStreaming(String q) {
SolrStreamingLinkGraphCSVExportClient solr = SolrStreamingLinkGraphCSVExportClient.createExporter(null, q);
- return new StreamingSolrExportBufferedInputStream(solr, 1000000); // 1 MIL
+ return new StreamingSolrExportBufferedInputStream(solr, 100000000); // 100M limit; the CSV streaming extractor needs a limit.
}
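
An aside for reviewers: the deduplication behaviour described in the Javadoc can be illustrated with a short sketch. All names below (LinkGraphDedupSketch, DomainDoc, export) are hypothetical; the actual streaming logic lives in SolrStreamingLinkGraphCSVExportClient and StreamingSolrExportBufferedInputStream.

import java.io.IOException;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

// Minimal sketch of the dedup idea from the Javadoc above: stream documents,
// emit a CSV row only for the first occurrence of each domain, and stop at the
// hard cap. All names here are hypothetical; this is not the SolrWayback API.
public class LinkGraphDedupSketch {

    private static final int MAX_DOCS = 100_000_000; // mirrors the 100M limit set in the diff above

    // Represents one Solr document with its domain and outgoing link domains.
    record DomainDoc(String domain, List<String> linksDomains) {}

    static void export(Iterable<DomainDoc> solrDocs, Appendable out) throws IOException {
        Set<String> seen = new HashSet<>(); // remembers which domains were already written
        int processed = 0;
        for (DomainDoc doc : solrDocs) {
            if (processed++ >= MAX_DOCS) {
                break; // the streaming extractor needs an upper bound
            }
            if (seen.add(doc.domain())) { // add() returns true only for the first occurrence
                out.append(doc.domain()).append(',')
                   .append(String.join(" ", doc.linksDomains()))
                   .append('\n');
            }
        }
    }
}

Splitting by crawl_year, as the Javadoc suggests, would then amount to running the export once per year with an extra filter such as crawl_year:2015 appended to q.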

@@ -87,6 +87,7 @@ public void contextInitialized(ServletContextEvent event) {
log.info("Using default warc-file-resolver implementation");
}

log.info(SolrWaybackAsciiLogo.SOLRWAYBACK_LOGO); // Log the ASCII logo when the application has started successfully.
log.info("solrwayback version " + version + " started successfully");

} catch (Exception e) {
@@ -0,0 +1,21 @@
package dk.kb.netarchivesuite.solrwayback.listeners;

public class SolrWaybackAsciiLogo {

// Some characters are escaped in the source; the logo prints correctly.
public static final String SOLRWAYBACK_LOGO =
"\n"
+ " _______. ______ __ .______ ____ __ ____ ___ ____ ____ .______ ___ ______ __ ___ \n"
+ " / | / __ \\ | | | _ \\ \\ \\ / \\ / / / \\ \\ \\ / / | _ \\ / \\ / || |/ / \n"
+ " | (----`| | | | | | | |_) | \\ \\/ \\/ / / ^ \\ \\ \\/ / | |_) | / ^ \\ | ,----'| ' / \n"
+ " \\ \\ | | | | | | | / \\ / / /_\\ \\ \\_ _/ | _ < / /_\\ \\ | | | < \n"
+ " .----) | | `--' | | `----.| |\\ \\----. \\ /\\ / / _____ \\ | | | |_) | / _____ \\ | `----.| . \\ \n"
+ " |_______/ \\______/ |_______|| _| `._____| \\__/ \\__/ /__/ \\__\\ |__| |______/ /__/ \\__\\ \\______||__|\\__\\"
+ "\n";



public static void main(String[] args) {
System.out.println(SOLRWAYBACK_LOGO);
}
}
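
On the escaping note above: a one-line, hypothetical illustration of why the source looks different from the printed banner.

System.out.println("\\__/"); // prints \__/ : each "\\" pair in the source is a single backslash in the output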