Skip to content

Commit

Permalink
Merge branch 'release/5.13.0'
Browse files Browse the repository at this point in the history
  • Loading branch information
BeritJanssen committed Oct 4, 2024
2 parents 768f91d + 49ba88a commit 895bf09
Show file tree
Hide file tree
Showing 64 changed files with 3,167 additions and 1,647 deletions.
2 changes: 1 addition & 1 deletion CITATION.cff
Original file line number Diff line number Diff line change
Expand Up @@ -35,5 +35,5 @@ keywords:
- elasticsearch
- natural language processing
license: MIT
version: 5.12.0
version: 5.13.0
date-released: '2024-08-30'
8 changes: 2 additions & 6 deletions backend/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -43,10 +43,6 @@ ianalyzer/config.py
# csv downloads
download/csv_files/

# word models
corpora/*/wm/*
!corpora/*/wm/documentation.md

# file storage
test_data/
data/
/test_data/
/data/
35 changes: 19 additions & 16 deletions backend/addcorpus/models.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,12 @@
import warnings

from django.contrib import admin
from django.contrib.auth.models import Group
from django.contrib.postgres.fields import ArrayField
from django.core.exceptions import ValidationError
from django.db import models
from django.db.models.constraints import UniqueConstraint

from addcorpus.constants import CATEGORIES, MappingType, VisualizationType
from addcorpus.validation.creation import (
validate_es_mapping, validate_field_language, validate_implication, validate_language_code,
Expand All @@ -12,15 +19,10 @@
)
from addcorpus.validation.indexing import (validate_essential_fields,
validate_has_configuration, validate_language_field, validate_has_data_directory)
from addcorpus.validation.publishing import (validate_default_sort,
validate_ngram_has_date_field)
from django.contrib import admin
from django.contrib.auth.models import Group
from django.contrib.postgres.fields import ArrayField
from django.core.exceptions import ValidationError
from django.db import models
from django.db.models.constraints import UniqueConstraint

from addcorpus.validation.publishing import (
validate_default_sort,
validate_ngram_has_date_field,
)
from ianalyzer.elasticsearch import elasticsearch

MAX_LENGTH_NAME = 126
Expand Down Expand Up @@ -264,14 +266,15 @@ def clean(self):

@property
def has_named_entities(self):
client = elasticsearch(self.es_index)
from es.search import total_hits

client = elasticsearch(self.corpus.name)
try:
mapping = client.indices.get_mapping(
index=self.es_index)
# in production, the index name can be different from the object's es_index value
index_name = list(mapping.keys())[0]
fields = mapping[index_name].get('mappings', {}).get('properties', {}).keys()
if any(field.endswith(':ner') for field in fields):
# we check if any fields exist for filtering named entities
ner_exists = client.search(
index=self.es_index, query={"exists": {"field": "ner:*"}}, size=0
)
if total_hits(ner_exists):
return True
except:
return False
Expand Down
6 changes: 6 additions & 0 deletions backend/addcorpus/python_corpora/corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from ianalyzer_readers.readers.xml import XMLReader
from ianalyzer_readers.readers.csv import CSVReader
from ianalyzer_readers.readers.html import HTMLReader
from ianalyzer_readers.readers.rdf import RDFReader
from ianalyzer_readers.readers.xlsx import XLSXReader

from addcorpus.python_corpora.filters import Filter
Expand Down Expand Up @@ -340,6 +341,11 @@ def source2dicts(self, source, *nargs, **kwargs):

yield field_dict

class RDFCorpusDefinition(CorpusDefinition, RDFReader):
    '''
    An RDFCorpus is any corpus that extracts its data from Linked Data files.

    Combines the generic `CorpusDefinition` with the `RDFReader` from
    `ianalyzer_readers`; it adds no members of its own.
    '''

# Fields ######################################################################


Expand Down
Loading

0 comments on commit 895bf09

Please sign in to comment.