Skip to content

Commit

Permalink
Merge branch 'develop' into feature/mapdataresults
Browse files Browse the repository at this point in the history
  • Loading branch information
ar-jan committed Oct 4, 2024
2 parents 68d961e + 652d5ab commit 6256fbf
Show file tree
Hide file tree
Showing 54 changed files with 3,008 additions and 1,603 deletions.
8 changes: 2 additions & 6 deletions backend/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -43,10 +43,6 @@ ianalyzer/config.py
# csv downloads
download/csv_files/

# word models
corpora/*/wm/*
!corpora/*/wm/documentation.md

# file storage
test_data/
data/
/test_data/
/data/
35 changes: 19 additions & 16 deletions backend/addcorpus/models.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,12 @@
import warnings

from django.contrib import admin
from django.contrib.auth.models import Group
from django.contrib.postgres.fields import ArrayField
from django.core.exceptions import ValidationError
from django.db import models
from django.db.models.constraints import UniqueConstraint

from addcorpus.constants import CATEGORIES, MappingType, VisualizationType
from addcorpus.validation.creation import (
validate_es_mapping, validate_field_language, validate_implication, validate_language_code,
Expand All @@ -12,15 +19,10 @@
)
from addcorpus.validation.indexing import (validate_essential_fields,
validate_has_configuration, validate_language_field, validate_has_data_directory)
from addcorpus.validation.publishing import (validate_default_sort,
validate_ngram_has_date_field)
from django.contrib import admin
from django.contrib.auth.models import Group
from django.contrib.postgres.fields import ArrayField
from django.core.exceptions import ValidationError
from django.db import models
from django.db.models.constraints import UniqueConstraint

from addcorpus.validation.publishing import (
validate_default_sort,
validate_ngram_has_date_field,
)
from ianalyzer.elasticsearch import elasticsearch

MAX_LENGTH_NAME = 126
Expand Down Expand Up @@ -264,14 +266,15 @@ def clean(self):

@property
def has_named_entities(self):
client = elasticsearch(self.es_index)
from es.search import total_hits

client = elasticsearch(self.corpus.name)
try:
mapping = client.indices.get_mapping(
index=self.es_index)
# in production, the index name can be different from the object's es_index value
index_name = list(mapping.keys())[0]
fields = mapping[index_name].get('mappings', {}).get('properties', {}).keys()
if any(field.endswith(':ner') for field in fields):
# we check if any fields exist for filtering named entities
ner_exists = client.search(
index=self.es_index, query={"exists": {"field": "ner:*"}}, size=0
)
if total_hits(ner_exists):
return True
except:
return False
Expand Down
6 changes: 6 additions & 0 deletions backend/addcorpus/python_corpora/corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from ianalyzer_readers.readers.xml import XMLReader
from ianalyzer_readers.readers.csv import CSVReader
from ianalyzer_readers.readers.html import HTMLReader
from ianalyzer_readers.readers.rdf import RDFReader
from ianalyzer_readers.readers.xlsx import XLSXReader

from addcorpus.python_corpora.filters import Filter
Expand Down Expand Up @@ -340,6 +341,11 @@ def source2dicts(self, source, *nargs, **kwargs):

yield field_dict

class RDFCorpusDefinition(CorpusDefinition, RDFReader):
    '''
    An RDFCorpus is any corpus that extracts its data from Linked Data files.

    Combines `CorpusDefinition` with the `RDFReader` base class; the class
    itself adds no attributes or methods of its own.
    '''

# Fields ######################################################################


Expand Down
Loading

0 comments on commit 6256fbf

Please sign in to comment.