Commit
fixes, but can't get mock to match
quadrismegistus committed Dec 5, 2023
1 parent b027e0b commit e5feb3e
Showing 2 changed files with 121 additions and 52 deletions.
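
Once applied, the updated command is run as a Django manage command along these lines (a sketch; the path and limits are placeholders):

python manage.py generate_textcorpus --path corpus/ --doc-limit 1000 --batch 100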
125 changes: 100 additions & 25 deletions ppa/archive/management/commands/generate_textcorpus.py
@@ -6,58 +6,133 @@

import os
import jsonlines
+import json
from django.core.management.base import BaseCommand
from ppa.archive.models import DigitizedWork
from parasolr.django import SolrQuerySet
from progressbar import progressbar
import logging
+import gzip


class Command(BaseCommand):
"""Custom manage command to generate a text corpus from text indexed in Solr"""

PAGE_OUTPUT_FIELDS = {'id','source_id','group_id_s','content','order','label','tags'}
# fields we want from pages
PAGE_FIELDLIST = [
'page_id:id',
'work_id:group_id_s',
'source_id:source_id',
'page_num:order',
'page_num_orig:label',
'page_tags:tags',
'page_text:content'
]

    def add_arguments(self, parser):
        """Build CLI arguments: --path, --doc-limit, and --batch"""
        parser.add_argument(
            "--path", required=True, help="Directory path to save corpus file(s)."
        )

        parser.add_argument(
            "--doc-limit",
            type=int,
            default=-1,
            help="Limit on the number of documents for corpus generation. "
            "The default of -1 considers ALL documents.",
        )
+        parser.add_argument(
+            "--batch",
+            type=int,
+            default=100,
+            help="Number of docs to save at one time",
+        )

-    def iter_solr(self, nsize=10, item_type='page'):
+    def iter_solr(self, batch_size=1000, item_type='page', lim=None, progress=True):
        """
        Iterate over solr documents of a certain `item_type`
        """
-        i=0
-        q=SolrQuerySet().search(item_type=item_type)
-        total = q.count()
-        for i in progressbar(range(0, total, nsize)):
-            q.set_limits(i,i+nsize)
-            yield from q
+        qset = SolrQuerySet()


-    def iter_pages(self):
-        for d in self.iter_solr(item_type='page'):
-            yield {k:v for k,v in d.items() if k in self.PAGE_OUTPUT_FIELDS}
+        def get_query(order=True):
+            q=qset.search(item_type=item_type)
+            if order: q=q.order_by('id')
+            if item_type=='page': q=q.only(*self.PAGE_FIELDLIST)
+            return q

-    def iter_works(self):
-        for d in self.iter_solr(item_type='work'):
-            yield d
+        q=get_query(order=False)
+        total = q.count()
+        if lim and int(total)>lim: total=lim
+        iterr = range(0, total, batch_size)
+        if progress:
+            iterr = progressbar(iterr)
+        for step in iterr:
+            q=get_query(order=True)
+            q.set_limits(step, step+batch_size)
+            for d in q:
+                print(d)
+                yield d

    def handle(self, *args, **options):
        """
        Run the command, generating metadata.json and pages.jsonl.gz
        """
+        # options
        path = options['path']
+        print(options)
+        doclimit = options['doc_limit'] if options['doc_limit']>0 else None
+        progress = options['verbosity']>0
+        batch_size = options['batch']
+        by_batch = batch_size > 1

        # paths
        os.makedirs(path, exist_ok=True)
-        path_meta = os.path.join(path,'metadata.jsonl')
-        path_texts = os.path.join(path,'pages.jsonl')
+        path_meta = os.path.join(path,'metadata.json')
+        path_texts = os.path.join(path,'pages.jsonl.gz')

-        with jsonlines.open(path_meta,'w') as of_meta:
-            for i,d in enumerate(self.iter_works()):
-                of_meta.write(d)
-                if options['doc_limit']>0 and i+1>=options['doc_limit']:
-                    break
+        # save metadata
+        def iter_works():
+            yield from self.iter_solr(
+                item_type='work',
+                lim=doclimit,
+                progress=progress,
+                # batch_size=batch_size if by_batch else 1000
+            )
+
+        output_ld = list(iter_works())
+        with open(path_meta,'w') as of:
+            json.dump(output_ld, of, indent=2)

-        with jsonlines.open(path_texts,'w') as of_meta:
-            for i,d in enumerate(self.iter_pages()):
-                of_meta.write(d)
-                if options['doc_limit']>0 and i+1>=options['doc_limit']:
-                    break
+        # save pages
+        def iter_pages():
+            yield from self.iter_solr(
+                item_type='page',
+                lim=doclimit,
+                progress=progress,
+                # batch_size=batch_size if by_batch else 1000
+            )
+
+        ### save pages
+        if not by_batch:
+            with gzip.open(path_texts,'wt',encoding='utf-8') as of:
+                for d in iter_pages():
+                    of.write(json.dumps(d)+'\n')
+        else:
+            with gzip.open(path_texts,'wt',encoding='utf-8') as of:
+                batch=[]
+
+                def save_batch():
+                    outstr='\n'.join(json.dumps(d) for d in batch) + '\n'
+                    of.write(outstr)
+
+                for i,d in enumerate(iter_pages()):
+                    if i and not i%batch_size:
+                        save_batch()
+                        batch=[]
+                    batch.append(d)
+
+                if batch:
+                    save_batch()
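
For reference, the two files the updated handle() writes can be read back with a few lines of Python. A minimal sketch against the paths set above; the corpus directory name is a placeholder for whatever --path was given:

import gzip
import json
import os

corpus_dir = "corpus"  # placeholder for the --path argument

# metadata.json holds a single JSON array of work records
with open(os.path.join(corpus_dir, "metadata.json")) as f:
    works = json.load(f)

# pages.jsonl.gz is gzip-compressed JSON lines: one page record per line
with gzip.open(os.path.join(corpus_dir, "pages.jsonl.gz"), "rt", encoding="utf-8") as f:
    pages = [json.loads(line) for line in f]

print(f"{len(works)} works, {len(pages)} pages")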
48 changes: 21 additions & 27 deletions ppa/archive/tests/test_generate_textcorpus.py
@@ -1,9 +1,9 @@
from unittest.mock import patch
+import json
+import gzip
import pytest
from django.core.management import call_command
from django.core.management.base import CommandError
import os
-import jsonlines,json

# mock results for facet query used to get document IDs and page counts
mock_solr_facets = {"group_id_s": {"doc_1": 2, "doc_2": 1}}
@@ -12,20 +12,22 @@
mock_solr_docs = [
# The first record has item_type='work' and contains metadata for the
# document
{"item_type": "work", "pub_year": 1863, "group_id_s":"doc_1"},
{"item_type": "work", "pub_year": "unknown","group_id_s":"doc_2"},
{"item_type": "work", "pub_year": 1863, "group_id_s":"doc_1", 'id':'yyy'},
{"item_type": "work", "pub_year": "unknown","group_id_s":"doc_2", 'id':'xxx'},
# If multiple metadata rows are found, the first one (above) is used
# Subsequent records have item_type='page', page-order specified by
# 'order', with content in 'content'
    {
+        'id':'yyy.001',
        "item_type": "page",
        "order": 1,
        "content": "Four score and seven years ago our fathers brought forth"
        " on this continent, a new nation, ",
        "group_id_s":"doc_1",
-        "label":'i'
+        "label":'i',
    },
    {
+        'id':'yyy.002',
        "item_type": "page",
        "order": 2,
        "content": "conceived in Liberty, and dedicated to the proposition"
@@ -37,6 +39,7 @@


    {
+        'id':'xxx.001',
        "item_type": "page",
        "order": 3,
        "content": "!!!!!",
@@ -55,35 +58,26 @@ def patched_solr_queryset(mock_solr_queryset):
"ppa.archive.management.commands.generate_textcorpus.SolrQuerySet", new=mock_qs
) as mock_queryset_cls:
mock_qs = mock_queryset_cls.return_value
-        mock_qs.get_results.return_value = mock_solr_docs
-        mock_qs.get_facets.return_value.facet_fields = mock_solr_facets

+        mock_qs.only.return_value.count.return_value = len(mock_solr_docs)
+        mock_qs.only.return_value.get_results.return_value = mock_solr_docs
yield mock_qs


def test_save(tmpdir, patched_solr_queryset):
-    call_command("generate_textcorpus", "--path", tmpdir.dirpath())
-    metadata_file = tmpdir.dirpath("metadata.csv")
-    assert metadata_file.check()
+    call_command("generate_textcorpus", "--path", tmpdir.dirpath(),'--doc-limit',10)
+    print(os.listdir(tmpdir.dirpath()))

-    with open(metadata_file) as f:
-        meta=json.load(f)
-    assert len(meta) == 2
-
-    tdir=tmpdir.dirpath('texts')
-    fns=os.listdir(tdir)
-    assert len(fns) == 2
-
-    fn1=os.path.join(tdir,fns[0])
-    fn2=os.path.join(tdir,fns[1])
-    with open(fn1) as f: ld1=json.load(f)
-    with open(fn2) as f: ld2=json.load(f)
-
-    assert len(ld1)==2
-    assert len(ld2)==1
-    assert all(all(bool(v) for k,v in d.items()) for d in ld1)
-    assert all(all(bool(v) for k,v in d.items()) for d in ld2)
+    metadata_file = tmpdir.dirpath("metadata.json")
+    pages_file = tmpdir.dirpath("pages.jsonl.gz")
+    assert metadata_file.check()
+    with open(metadata_file) as f: meta=json.load(f)
+    assert len(meta) == 10
+
+    def numlines(fngz):
+        with gzip.open(fngz,'rt',encoding='utf-8') as f:
+            return sum(1 for ln in f)
+
+    assert numlines(pages_file) > 10
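
Per the commit message, the fixture above still does not match what the command asks of it: the command now builds SolrQuerySet().search(...).order_by('id').only(...) and then calls count(), set_limits(), and iterates the result, so each fluent method has to return something that supports the rest of the chain. A minimal sketch of one way to wire that up with plain unittest.mock — an assumption about a possible fix, not part of this commit:

# Sketch: route every fluent call back to the same mock, so any chain of
# search()/order_by()/only() ends on one object that can be counted,
# limited, and iterated.
mock_qs = mock_queryset_cls.return_value
for fluent in ("search", "order_by", "only"):
    getattr(mock_qs, fluent).return_value = mock_qs
mock_qs.count.return_value = len(mock_solr_docs)
mock_qs.set_limits.return_value = mock_qs
# build a fresh iterator on each pass so the mock can be iterated repeatedly
mock_qs.__iter__.side_effect = lambda: iter(mock_solr_docs)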


