Skip to content

Commit

Permalink
adding doc limit back in
Browse files (browse the repository at this point in the history)
  • Loading branch information
quadrismegistus committed Dec 4, 2023
1 parent 3f5bd51 commit b027e0b
Showing 1 changed file with 16 additions and 3 deletions.
19 changes: 16 additions & 3 deletions ppa/archive/management/commands/generate_textcorpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,13 @@ def add_arguments(self, parser):
parser.add_argument(
"--path", required=True, help="Directory path to save corpus file(s)."
)
parser.add_argument(
"--doc-limit",
type=int,
default=-1,
help="Limit on the number of documents for corpus generation."
"The default of -1 considers ALL documents.",
)

def iter_solr(self, nsize=10, item_type='page'):
i=0
Expand All @@ -42,9 +49,15 @@ def handle(self, *args, **options):
os.makedirs(path, exist_ok=True)
path_meta = os.path.join(path,'metadata.jsonl')
path_texts = os.path.join(path,'pages.jsonl')

with jsonlines.open(path_meta,'w') as of_meta:
for d in self.iter_works():
for i,d in enumerate(self.iter_works()):
of_meta.write(d)
if options['doc_limit']>0 and i+1>=options['doc_limit']:
break

with jsonlines.open(path_texts,'w') as of_meta:
for d in self.iter_pages():
of_meta.write(d)
for i,d in enumerate(self.iter_pages()):
of_meta.write(d)
if options['doc_limit']>0 and i+1>=options['doc_limit']:
break

0 comments on commit b027e0b

Please sign in to comment.