YaleDHLab · pleonard212 · Jun 10, 2017 · Jun 10, 2017
diff --git a/yale_daily_news/requirements.txt b/yale_daily_news/requirements.txt
@@ -1,3 +1,5 @@
 numpy==1.11.3
 scipy==0.18.1
-scikit-image==0.12.3
+scikit-image==0.12.3
+pytesseract==0.1.7
+PIL==1.1.7
diff --git a/yale_daily_news/segment_ydn_images.py b/yale_daily_news/segment_ydn_images.py
@@ -6,6 +6,9 @@
 from shutil import Error, move, rmtree
 import numpy as np
 import glob, os, codecs, sys, json
+from PIL import Image
+import pytesseract
+
 
 '''
 ## Processing notes
@@ -456,12 +459,21 @@ def store_article_titles():
         # store the mapping from article path to first image path
         first_image_name = str(first_image['rect_id']) + '.png'
         path_to_first_image = os.path.join(article_path, first_image_name)
-        article_to_title[article_path] = path_to_first_image
+        best_guess_headline = pytesseract.image_to_string(Image.open('segmented_images' + path_to_first_image), lang='eng')
+        if best_guess_headline != '':
+        	best_guess_headline =  best_guess_headline.replace('-\n', '') # Fix hyphenation at column end
+ 	     	best_guess_headline =  best_guess_headline.replace('\n', ' ') # Put short lines together
+   	     	best_guess_headline = best_guess_headline[:100] # Take only the first part of the string (could be whole article!)
+        else:
+         	best_guess_headline = '[Untitled]' # If we got nothing, use placeholder.
+        print('Processing article: "' + best_guess_headline) 
+        article_to_title[article_path] = best_guess_headline
 
   with open('articles_to_titles.json', 'w') as out:
     json.dump(article_to_title, out)
 
 
+
 ##################
 # Segment Images #
 ##################
@@ -527,7 +539,14 @@ def segment_images(process_id):
           os.makedirs(out_path)
 
         io.imsave(out_path + str(rect_id) + '.png', cropped)
-
+        # This is an ugly hack to force 300dpi into the png metadata.
+        # Without explicit dpi, tesseract assumes low (70) dpi; this is sub-optimal.
+        try:
+        	Image.open(out_path + str(rect_id) + '.png').save(out_path + str(rect_id) + '.png',dpi=[300,300])
+        except:
+   		 	print("Couldn't write DPI to file " + out_path + str(rect_id) + ".png")
+
+
 
 def convert_coordinates(xml_coordinate_array, jp2_array, page):
   '''
@@ -656,7 +675,11 @@ def stack_segmented_images():
       os.makedirs(composite_path)
 
     io.imsave(os.path.join(composite_path, str(article_id) + '.png'), composite_image)
-
+    #ugly hack to get 300 dpi metadata which tesseract will need.
+    try:
+    	Image.open(os.path.join(composite_path, str(article_id) + '.png')).save(os.path.join(composite_path, str(article_id) + '.png'),dpi=[300,300])
+    except:
+    	print("Couldn't write DPI to file " + composite_path, str(article_id) + ".png")
 
 ##############
 # Main Block #
@@ -691,19 +714,19 @@ def stack_segmented_images():
       pass
 
   # Define the directory that contains subdirectories for each paper issue
-  root_data_directory = '/Users/doug/Desktop/ydn-sample/'
+  root_data_directory = '/media/dhlab/PG4T/ydn/fourissuesonly'
 
   # Define whether to run code in verbose mode
   verbosity_level = 1
 
   # Identify the maximum number of processors to use during analysis
-  n_processes = 4
+  n_processes = 8
 
   # Specify the maximum number of files to process
-  max_files_to_process = 20
+  max_files_to_process = 1000000
 
   # allow users to toggle multiprocessing on/off
-  multiprocess = False
+  multiprocess = True
 
   # specify how much padding to add to cropped images
   padding = 5
@@ -736,4 +759,4 @@ def stack_segmented_images():
   store_article_titles()
 
   # Combine the segmented images for each article into one composite image
-  stack_segmented_images()
+  stack_segmented_images()