diff --git a/docs/debeir.html b/docs/debeir.html
index 27beff9..2e9f1d6 100644
--- a/docs/debeir.html
+++ b/docs/debeir.html
@@ -50,7 +50,7 @@

The DeBEIR (Dense Bi-Encoder Information Retrieval) source code library.

-See ./main.py in the parent directory for an out-of-the-box runnable code.
+See ./examples/ in the parent directory for out-of-the-box runnable code.

Otherwise, check out notebooks in the parent directory for training your own model amongst other things.

@@ -62,7 +62,7 @@

1"""
 2The DeBEIR (Dense Bi-Encoder Information Retrieval) source code library.
 3
-4See ./main.py in the parent directory for an out-of-the-box runnable code.
+4See ./examples/ in the parent directory for out-of-the-box runnable code.
 5
 6Otherwise, check out notebooks in the parent directory for training your own model amongst other things.
 7"""
diff --git a/docs/debeir/core.html b/docs/debeir/core.html
index b05bd8c..4e665ca 100644
--- a/docs/debeir/core.html
+++ b/docs/debeir/core.html
@@ -57,7 +57,7 @@ 

Core library interfaces that must be implemented for custom datasets

-Interfaces to implement custom data_sets in nir.data_sets.
+Interfaces to implement custom datasets in debeir.datasets.

@@ -67,7 +67,7 @@

1"""
 2Core library interfaces that must be implemented for custom datasets
 3
-4Interfaces to implement custom data_sets in nir.data_sets.
+4Interfaces to implement custom datasets in debeir.datasets.
 5"""
 
diff --git a/docs/debeir/data_sets.html b/docs/debeir/data_sets.html deleted file mode 100644 index 813765b..0000000 --- a/docs/debeir/data_sets.html +++ /dev/null @@ -1,264 +0,0 @@ - - - - - - - debeir.data_sets API documentation - - - - - - - - - -
-debeir.data_sets

Contains data_sets implemented from nir.interfaces

1. Parser (For reading data from files into a Dict object)
2. Query object (Generating queries)
    - These query objects can be very lightweight containing only the mappings of the index.
-
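A matching sketch of the query-object half of this pairing, assuming the GenericElasticsearchQuery base class used by the bundled datasets below; the mapping names are hypothetical:

    from debeir.interfaces.query import GenericElasticsearchQuery


    class MyDatasetQuery(GenericElasticsearchQuery):
        def __init__(self, topics, config, *args, **kwargs):
            super().__init__(topics, config, *args, **kwargs)
            # Lightweight: little more than the index mappings to search over.
            self.mappings = ["Text"]
            self.embed_mappings = ["Text_Embedding"]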
1"""
-2Contains data_sets implemented from nir.interfaces
-31. Parser (For reading data from files into a Dict object)
-42. Query object (Generating queries)
-5    - These query objects can be very lightweight containing only the mappings of the index.
-6"""
-
\ No newline at end of file
diff --git a/docs/debeir/data_sets/bioreddit.html b/docs/debeir/data_sets/bioreddit.html
deleted file mode 100644
index e0ba72c..0000000
--- a/docs/debeir/data_sets/bioreddit.html
+++ /dev/null
@@ -1,546 +0,0 @@
-debeir.data_sets.bioreddit API documentation
-debeir.data_sets.bioreddit
 1from typing import Dict
- 2
- 3from debeir.interfaces.query import GenericElasticsearchQuery
- 4from debeir.interfaces.parser import CSVParser
- 5
- 6
- 7class BioRedditSubmissionParser(CSVParser):
- 8    """
- 9    Parser for the BioReddit Submission Dataset
-10    """
-11    parse_fields = ["id", "body"]
-12
-13    @classmethod
-14    def get_topics(cls, csvfile) -> Dict[int, Dict[str, str]]:
-15        return super().get_topics(csvfile)
-16
-17
-18class BioRedditCommentParser(CSVParser):
-19    """
-20    Parser for the BioReddit Comment Dataset
-21    """
-22    parse_fields = ["id", "parent_id", "selftext", "title"]
-23
-24    @classmethod
-25    def get_topics(cls, csvfile) -> Dict[str, Dict[str, str]]:
-26        topics = super().get_topics(csvfile)
-27        temp = {}
-28
-29        for _, topic in topics.items():
-30            topic["text"] = topic.pop("selftext")
-31            topic["text2"] = topic.pop("title")
-32            temp[topic["id"]] = topic
-33
-34        return temp
-35
-36
-37class BioRedditElasticsearchQuery(GenericElasticsearchQuery):
-38    """
-39    Elasticsearch Query object for the BioReddit
-40    """
-41    def __init__(self, topics, config, *args, **kwargs):
-42        super().__init__(topics, config, *args, **kwargs)
-43        self.mappings = ["Text"]
-44
-45        self.topics = topics
-46        self.config = config
-47        self.query_type = self.config.query_type
-48
-49        self.embed_mappings = ["Text_Embedding"]
-50
-51        self.query_funcs = {
-52            "query": self.generate_query,
-53            "embedding": self.generate_query_embedding,
-54        }
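As a usage sketch for the parsers above (the CSV path is hypothetical; get_topics expects an already-opened file):

    from debeir.data_sets.bioreddit import BioRedditCommentParser

    # Hypothetical topics file with id, parent_id, selftext and title columns.
    with open("bioreddit_comments.csv") as csvfile:
        topics = BioRedditCommentParser.get_topics(csvfile)

    # Topics are keyed by "id"; "selftext"/"title" are renamed to "text"/"text2".
    for topic_id, topic in topics.items():
        print(topic_id, topic["text"], topic["text2"])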
-
class BioRedditSubmissionParser(debeir.interfaces.parser.CSVParser):
 8class BioRedditSubmissionParser(CSVParser):
- 9    """
-10    Parser for the BioReddit Submission Dataset
-11    """
-12    parse_fields = ["id", "body"]
-13
-14    @classmethod
-15    def get_topics(cls, csvfile) -> Dict[int, Dict[str, str]]:
-16        return super().get_topics(csvfile)
-
Parser for the BioReddit Submission Dataset
- -
-
@classmethod
def get_topics(cls, csvfile) -> Dict[int, Dict[str, str]]:
14    @classmethod
-15    def get_topics(cls, csvfile) -> Dict[int, Dict[str, str]]:
-16        return super().get_topics(csvfile)
-
- - -

Instance method for getting topics; forwards instance self parameters to the _get_topics class method.

-
class BioRedditCommentParser(debeir.interfaces.parser.CSVParser):
19class BioRedditCommentParser(CSVParser):
-20    """
-21    Parser for the BioReddit Comment Dataset
-22    """
-23    parse_fields = ["id", "parent_id", "selftext", "title"]
-24
-25    @classmethod
-26    def get_topics(cls, csvfile) -> Dict[str, Dict[str, str]]:
-27        topics = super().get_topics(csvfile)
-28        temp = {}
-29
-30        for _, topic in topics.items():
-31            topic["text"] = topic.pop("selftext")
-32            topic["text2"] = topic.pop("title")
-33            temp[topic["id"]] = topic
-34
-35        return temp
-
Parser for the BioReddit Comment Dataset
- -
-
@classmethod
def get_topics(cls, csvfile) -> Dict[str, Dict[str, str]]:
25    @classmethod
-26    def get_topics(cls, csvfile) -> Dict[str, Dict[str, str]]:
-27        topics = super().get_topics(csvfile)
-28        temp = {}
-29
-30        for _, topic in topics.items():
-31            topic["text"] = topic.pop("selftext")
-32            topic["text2"] = topic.pop("title")
-33            temp[topic["id"]] = topic
-34
-35        return temp
-
- - -

Instance method for getting topics; forwards instance self parameters to the _get_topics class method.

-
class BioRedditElasticsearchQuery(debeir.interfaces.query.GenericElasticsearchQuery):
38class BioRedditElasticsearchQuery(GenericElasticsearchQuery):
-39    """
-40    Elasticsearch Query object for the BioReddit
-41    """
-42    def __init__(self, topics, config, *args, **kwargs):
-43        super().__init__(topics, config, *args, **kwargs)
-44        self.mappings = ["Text"]
-45
-46        self.topics = topics
-47        self.config = config
-48        self.query_type = self.config.query_type
-49
-50        self.embed_mappings = ["Text_Embedding"]
-51
-52        self.query_funcs = {
-53            "query": self.generate_query,
-54            "embedding": self.generate_query_embedding,
-55        }
-
Elasticsearch Query object for the BioReddit
BioRedditElasticsearchQuery(topics, config, *args, **kwargs)
42    def __init__(self, topics, config, *args, **kwargs):
-43        super().__init__(topics, config, *args, **kwargs)
-44        self.mappings = ["Text"]
-45
-46        self.topics = topics
-47        self.config = config
-48        self.query_type = self.config.query_type
-49
-50        self.embed_mappings = ["Text_Embedding"]
-51
-52        self.query_funcs = {
-53            "query": self.generate_query,
-54            "embedding": self.generate_query_embedding,
-55        }
-
\ No newline at end of file
diff --git a/docs/debeir/data_sets/clinical_trials.html b/docs/debeir/data_sets/clinical_trials.html
deleted file mode 100644
index 525d0b6..0000000
--- a/docs/debeir/data_sets/clinical_trials.html
+++ /dev/null
@@ -1,2111 +0,0 @@
-debeir.data_sets.clinical_trials API documentation
-debeir.data_sets.clinical_trials
  1import csv
-  2import loguru
-  3
-  4from dataclasses import dataclass
-  5from typing import Dict, Union, Optional, List
-  6from elasticsearch import AsyncElasticsearch as Elasticsearch
-  7
-  8from debeir.interfaces.executor import GenericElasticsearchExecutor
-  9from debeir.interfaces.query import GenericElasticsearchQuery
- 10from debeir.engines.elasticsearch.generate_script_score import generate_script
- 11
- 12from debeir.interfaces.config import GenericConfig, apply_config
- 13from debeir.interfaces.parser import Parser
- 14from debeir.rankers.transformer_sent_encoder import Encoder
- 15from debeir.utils.scaler import get_z_value
- 16
- 17
- 18@dataclass(init=True, unsafe_hash=True)
- 19class TrialsQueryConfig(GenericConfig):
- 20    query_field_usage: str = None
- 21    embed_field_usage: str = None
- 22    fields: List[str] = None
- 23
- 24    def validate(self):
- 25        """
- 26        Checks if query type is included, and checks if an encoder is included for embedding queries
- 27        """
- 28        if self.query_type == "embedding":
- 29            assert self.query_field_usage and self.embed_field_usage, (
- 30                "Must have both field usages" " if embedding query"
- 31            )
- 32            assert (
- 33                self.encoder_fp and self.encoder
- 34            ), "Must provide encoder path for embedding model"
- 35            assert self.norm_weight is not None or self.automatic is not None, (
- 36                "Norm weight be specified or be " "automatic "
- 37            )
- 38
- 39        assert (
- 40            self.query_field_usage is not None or self.fields is not None
- 41        ), "Must have a query field"
- 42        assert self.query_type in [
- 43            "ablation",
- 44            "query",
- 45            "query_best",
- 46            "embedding",
- 47        ], "Check your query type"
- 48
- 49    @classmethod
- 50    def from_toml(cls, fp: str, *args, **kwargs) -> "GenericConfig":
- 51        return super().from_toml(fp, cls, *args, **kwargs)
- 52
- 53    @classmethod
- 54    def from_dict(cls, **kwargs) -> "GenericConfig":
- 55        return super().from_dict(cls, **kwargs)
- 56
- 57
- 58class TrialsElasticsearchQuery(GenericElasticsearchQuery):
- 59    """
- 60    Elasticsearch Query object for the Clinical Trials Index
- 61    """
- 62    topics: Dict[int, Dict[str, str]]
- 63    query_type: str
- 64    fields: List[int]
- 65    query_funcs: Dict
- 66    config: GenericConfig
- 67    id_mapping: str = "_id"
- 68    mappings: List[str]
- 69    config: TrialsQueryConfig
- 70
- 71    def __init__(self, topics, query_type, config=None, *args, **kwargs):
- 72        super().__init__(topics, config, *args, **kwargs)
- 73        self.query_type = query_type
- 74        self.config = config
- 75        self.topics = topics
- 76        self.fields = []
- 77        self.mappings = [
- 78            "HasExpandedAccess",
- 79            "BriefSummary.Textblock",
- 80            "CompletionDate.Type",
- 81            "OversightInfo.Text",
- 82            "OverallContactBackup.PhoneExt",
- 83            "RemovedCountries.Text",
- 84            "SecondaryOutcome",
- 85            "Sponsors.LeadSponsor.Text",
- 86            "BriefTitle",
- 87            "IDInfo.NctID",
- 88            "IDInfo.SecondaryID",
- 89            "OverallContactBackup.Phone",
- 90            "Eligibility.StudyPop.Textblock",
- 91            "DetailedDescription.Textblock",
- 92            "Eligibility.MinimumAge",
- 93            "Sponsors.Collaborator",
- 94            "Reference",
- 95            "Eligibility.Criteria.Textblock",
- 96            "XMLName.Space",
- 97            "Rank",
- 98            "OverallStatus",
- 99            "InterventionBrowse.Text",
-100            "Eligibility.Text",
-101            "Intervention",
-102            "BiospecDescr.Textblock",
-103            "ResponsibleParty.NameTitle",
-104            "NumberOfArms",
-105            "ResponsibleParty.ResponsiblePartyType",
-106            "IsSection801",
-107            "Acronym",
-108            "Eligibility.MaximumAge",
-109            "DetailedDescription.Text",
-110            "StudyDesign",
-111            "OtherOutcome",
-112            "VerificationDate",
-113            "ConditionBrowse.MeshTerm",
-114            "Enrollment.Text",
-115            "IDInfo.Text",
-116            "ConditionBrowse.Text",
-117            "FirstreceivedDate",
-118            "NumberOfGroups",
-119            "OversightInfo.HasDmc",
-120            "PrimaryCompletionDate.Text",
-121            "ResultsReference",
-122            "Eligibility.StudyPop.Text",
-123            "IsFdaRegulated",
-124            "WhyStopped",
-125            "ArmGroup",
-126            "OverallContact.LastName",
-127            "Phase",
-128            "RemovedCountries.Country",
-129            "InterventionBrowse.MeshTerm",
-130            "Eligibility.HealthyVolunteers",
-131            "Location",
-132            "OfficialTitle",
-133            "OverallContact.Email",
-134            "RequiredHeader.Text",
-135            "RequiredHeader.URL",
-136            "LocationCountries.Country",
-137            "OverallContact.PhoneExt",
-138            "Condition",
-139            "PrimaryOutcome",
-140            "LocationCountries.Text",
-141            "BiospecDescr.Text",
-142            "IDInfo.OrgStudyID",
-143            "Link",
-144            "OverallContact.Phone",
-145            "Source",
-146            "ResponsibleParty.InvestigatorAffiliation",
-147            "StudyType",
-148            "FirstreceivedResultsDate",
-149            "Enrollment.Type",
-150            "Eligibility.Gender",
-151            "OverallContactBackup.LastName",
-152            "Keyword",
-153            "BiospecRetention",
-154            "CompletionDate.Text",
-155            "OverallContact.Text",
-156            "RequiredHeader.DownloadDate",
-157            "Sponsors.Text",
-158            "Text",
-159            "Eligibility.SamplingMethod",
-160            "LastchangedDate",
-161            "ResponsibleParty.InvestigatorFullName",
-162            "StartDate",
-163            "RequiredHeader.LinkText",
-164            "OverallOfficial",
-165            "Sponsors.LeadSponsor.AgencyClass",
-166            "OverallContactBackup.Text",
-167            "Eligibility.Criteria.Text",
-168            "XMLName.Local",
-169            "OversightInfo.Authority",
-170            "PrimaryCompletionDate.Type",
-171            "ResponsibleParty.Organization",
-172            "IDInfo.NctAlias",
-173            "ResponsibleParty.Text",
-174            "TargetDuration",
-175            "Sponsors.LeadSponsor.Agency",
-176            "BriefSummary.Text",
-177            "OverallContactBackup.Email",
-178            "ResponsibleParty.InvestigatorTitle",
-179        ]
-180
-181        self.best_recall_fields = [
-182            "LocationCountries.Country",
-183            "BiospecRetention",
-184            "DetailedDescription.Textblock",
-185            "HasExpandedAccess",
-186            "ConditionBrowse.MeshTerm",
-187            "RequiredHeader.LinkText",
-188            "WhyStopped",
-189            "BriefSummary.Textblock",
-190            "Eligibility.Criteria.Textblock",
-191            "OfficialTitle",
-192            "Eligibility.MaximumAge",
-193            "Eligibility.StudyPop.Textblock",
-194            "BiospecDescr.Textblock",
-195            "BriefTitle",
-196            "Eligibility.MinimumAge",
-197            "ResponsibleParty.Organization",
-198            "TargetDuration",
-199            "Condition",
-200            "IDInfo.OrgStudyID",
-201            "Keyword",
-202            "Source",
-203            "Sponsors.LeadSponsor.Agency",
-204            "ResponsibleParty.InvestigatorAffiliation",
-205            "OversightInfo.Authority",
-206            "OversightInfo.HasDmc",
-207            "OverallContact.Phone",
-208            "Phase",
-209            "OverallContactBackup.LastName",
-210            "Acronym",
-211            "InterventionBrowse.MeshTerm",
-212            "RemovedCountries.Country",
-213        ]
-214        self.best_map_fields = [
-215            "Eligibility.Gender",
-216            "LocationCountries.Country",
-217            "DetailedDescription.Textblock",
-218            "BriefSummary.Textblock",
-219            "ConditionBrowse.MeshTerm",
-220            "Eligibility.Criteria.Textblock",
-221            "InterventionBrowse.MeshTerm",
-222            "StudyType",
-223            "IsFdaRegulated",
-224            "HasExpandedAccess",
-225            "RequiredHeader.LinkText",
-226            "BiospecRetention",
-227            "OfficialTitle",
-228            "Eligibility.SamplingMethod",
-229            "Eligibility.StudyPop.Textblock",
-230            "Condition",
-231            "Eligibility.MinimumAge",
-232            "Keyword",
-233            "Eligibility.MaximumAge",
-234            "BriefTitle",
-235        ]
-236        self.best_embed_fields = [
-237            "WhyStopped",
-238            "HasExpandedAccess",
-239            "BiospecRetention",
-240            "BriefSummary.Textblock",
-241            "LocationCountries.Country",
-242            "ConditionBrowse.MeshTerm",
-243            "DetailedDescription.Textblock",
-244            "RequiredHeader.LinkText",
-245            "Eligibility.Criteria.Textblock",
-246        ]
-247
-248        self.sensible = [
-249            "BriefSummary.Textblock" "BriefTitle",
-250            "Eligibility.StudyPop.Textblock",
-251            "DetailedDescription.Textblock",
-252            "Eligibility.MinimumAge",
-253            "Eligibility.Criteria.Textblock",
-254            "InterventionBrowse.Text",
-255            "Eligibility.Text",
-256            "BiospecDescr.Textblock",
-257            "Eligibility.MaximumAge",
-258            "DetailedDescription.Text",
-259            "ConditionBrowse.MeshTerm",
-260            "ConditionBrowse.Text",
-261            "Eligibility.StudyPop.Text",
-262            "InterventionBrowse.MeshTerm",
-263            "OfficialTitle",
-264            "Condition",
-265            "PrimaryOutcome",
-266            "BiospecDescr.Text",
-267            "Eligibility.Gender",
-268            "Keyword",
-269            "BiospecRetention",
-270            "Eligibility.Criteria.Text",
-271            "BriefSummary.Text",
-272        ]
-273
-274        self.sensible_embed = [
-275            "BriefSummary.Textblock" "BriefTitle",
-276            "Eligibility.StudyPop.Textblock",
-277            "DetailedDescription.Textblock",
-278            "Eligibility.Criteria.Textblock",
-279            "InterventionBrowse.Text",
-280            "Eligibility.Text",
-281            "BiospecDescr.Textblock",
-282            "DetailedDescription.Text",
-283            "ConditionBrowse.MeshTerm",
-284            "ConditionBrowse.Text",
-285            "Eligibility.StudyPop.Text",
-286            "InterventionBrowse.MeshTerm",
-287            "OfficialTitle",
-288            "Condition",
-289            "PrimaryOutcome",
-290            "BiospecDescr.Text",
-291            "Keyword",
-292            "BiospecRetention",
-293            "Eligibility.Criteria.Text",
-294            "BriefSummary.Text",
-295        ]
-296
-297        self.sensible_embed_safe = list(
-298            set(self.best_recall_fields).intersection(set(self.sensible_embed))
-299        )
-300
-301        self.query_funcs = {
-302            "query": self.generate_query,
-303            "ablation": self.generate_query_ablation,
-304            "embedding": self.generate_query_embedding,
-305        }
-306
-307        loguru.logger.debug(self.sensible_embed_safe)
-308
-309        self.field_usage = {
-310            "best_recall_fields": self.best_recall_fields,
-311            "all": self.mappings,
-312            "best_map_fields": self.best_map_fields,
-313            "best_embed_fields": self.best_embed_fields,
-314            "sensible": self.sensible,
-315            "sensible_embed": self.sensible_embed,
-316            "sensible_embed_safe": self.sensible_embed_safe,
-317        }
-318
-319    @apply_config
-320    def generate_query(self, topic_num, query_field_usage, **kwargs) -> Dict:
-321        """
-322        Generates a query for the clinical trials index
-323
-324        :param topic_num: Topic number to search
-325        :param query_field_usage: Which document facets to search over
-326        :param kwargs:
-327        :return:
-328            A basic elasticsearch query for clinical trials
-329        """
-330        fields = self.field_usage[query_field_usage]
-331        should = {"should": []}
-332
-333        qfield = list(self.topics[topic_num].keys())[0]
-334        query = self.topics[topic_num][qfield]
-335
-336        for i, field in enumerate(fields):
-337            should["should"].append(
-338                {
-339                    "match": {
-340                        f"{field}": {
-341                            "query": query,
-342                        }
-343                    }
-344                }
-345            )
-346
-347        query = {
-348            "query": {
-349                "bool": should,
-350            }
-351        }
-352
-353        return query
-354
-355    def generate_query_ablation(self, topic_num, **kwargs):
-356        """
-357        Only search one document facet at a time
-358        :param topic_num:
-359        :param kwargs:
-360        :return:
-361        """
-362        query = {"query": {"match": {}}}
-363
-364        for field in self.fields:
-365            query["query"]["match"][self.mappings[field]] = ""
-366
-367        for qfield in self.fields:
-368            qfield = self.mappings[qfield]
-369            for field in self.topics[topic_num]:
-370                query["query"]["match"][qfield] += self.topics[topic_num][field]
-371
-372        return query
-373
-374    @apply_config
-375    def generate_query_embedding(
-376        self,
-377        topic_num,
-378        encoder,
-379        query_field_usage,
-380        embed_field_usage,
-381        cosine_weights: List[float] = None,
-382        query_weight: List[float] = None,
-383        norm_weight=2.15,
-384        ablations=False,
-385        automatic_scores=None,
-386        **kwargs,
-387    ):
-388        """
-389        Computes the NIR score for a given topic
-390
-391        Score = log(BM25)/log(norm_weight) + embedding_score
-392
-393        :param topic_num:
-394        :param encoder:
-395        :param query_field_usage:
-396        :param embed_field_usage:
-397        :param cosine_weights:
-398        :param query_weight:
-399        :param norm_weight:
-400        :param ablations:
-401        :param automatic_scores:
-402        :param kwargs:
-403        :return:
-404        """
-405        should = {"should": []}
-406
-407        assert norm_weight or automatic_scores
-408
-409        query_fields = self.field_usage[query_field_usage]
-410        embed_fields = self.field_usage[embed_field_usage]
-411
-412        qfield = list(self.topics[topic_num].keys())[0]
-413        query = self.topics[topic_num][qfield]
-414
-415        for i, field in enumerate(query_fields):
-416            should["should"].append(
-417                {
-418                    "match": {
-419                        f"{field}": {
-420                            "query": query,
-421                            "boost": query_weight[i] if query_weight else 1,
-422                        }
-423                    }
-424                }
-425            )
-426
-427        if automatic_scores is not None:
-428            norm_weight = get_z_value(
-429                cosine_ceiling=len(embed_fields) * len(query_fields),
-430                bm25_ceiling=automatic_scores[topic_num],
-431            )
-432
-433        params = {
-434            "weights": cosine_weights if cosine_weights else [1] * len(embed_fields),
-435            "q_eb": encoder.encode(self.topics[topic_num][qfield]),
-436            "offset": 1.0,
-437            "norm_weight": norm_weight,
-438            "disable_bm25": ablations,
-439        }
-440
-441        query = {
-442            "query": {
-443                "script_score": {
-444                    "query": {
-445                        "bool": should,
-446                    },
-447                    "script": generate_script(self.best_embed_fields, params=params),
-448                },
-449            }
-450        }
-451
-452        return query
-453
-454    def get_query_type(self, *args, **kwargs):
-455        return self.query_funcs[self.query_type](*args, **kwargs)
-456
-457    def get_id_mapping(self, hit):
-458        return hit[self.id_mapping]
-459
-460
-461class ClinicalTrialsElasticsearchExecutor(GenericElasticsearchExecutor):
-462    """
-463    Executes queries given a query object.
-464    """
-465    query: TrialsElasticsearchQuery
-466
-467    def __init__(
-468        self,
-469        topics: Dict[Union[str, int], Dict[str, str]],
-470        client: Elasticsearch,
-471        index_name: str,
-472        output_file: str,
-473        query: TrialsElasticsearchQuery,
-474        encoder: Optional[Encoder] = None,
-475        config=None,
-476        *args,
-477        **kwargs,
-478    ):
-479
-480        super().__init__(
-481            topics,
-482            client,
-483            index_name,
-484            output_file,
-485            query,
-486            encoder,
-487            config=config,
-488            *args,
-489            **kwargs,
-490        )
-491
-492        self.query_fns = {
-493            "query": self.generate_query,
-494            "ablation": self.generate_query_ablation,
-495            "embedding": self.generate_embedding_query,
-496        }
-497
-498
-499class ClinicalTrialParser(Parser):
-500    """
-501    Parser for Clinical Trials topics
-502    """
-503    @classmethod
-504    def get_topics(cls, csvfile) -> Dict[int, Dict[str, str]]:
-505        topics = {}
-506        reader = csv.reader(csvfile)
-507        for i, row in enumerate(reader):
-508            if i == 0:
-509                continue
-510
-511            _id = row[0]
-512            text = row[1]
-513
-514            topics[_id] = {"text": text}
-515
-516        return topics
-
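A quick usage sketch for the ClinicalTrialParser defined at the end of this module (the CSV path is hypothetical; the file is expected to hold a header row followed by id,text rows):

    from debeir.data_sets.clinical_trials import ClinicalTrialParser

    with open("trials_topics.csv") as csvfile:
        topics = ClinicalTrialParser.get_topics(csvfile)

    # topics maps each topic id to {"text": ...}
    print(topics)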
@dataclass(init=True, unsafe_hash=True)
class TrialsQueryConfig(debeir.interfaces.config.GenericConfig):
19@dataclass(init=True, unsafe_hash=True)
-20class TrialsQueryConfig(GenericConfig):
-21    query_field_usage: str = None
-22    embed_field_usage: str = None
-23    fields: List[str] = None
-24
-25    def validate(self):
-26        """
-27        Checks if query type is included, and checks if an encoder is included for embedding queries
-28        """
-29        if self.query_type == "embedding":
-30            assert self.query_field_usage and self.embed_field_usage, (
-31                "Must have both field usages" " if embedding query"
-32            )
-33            assert (
-34                self.encoder_fp and self.encoder
-35            ), "Must provide encoder path for embedding model"
-36            assert self.norm_weight is not None or self.automatic is not None, (
-37                "Norm weight be specified or be " "automatic "
-38            )
-39
-40        assert (
-41            self.query_field_usage is not None or self.fields is not None
-42        ), "Must have a query field"
-43        assert self.query_type in [
-44            "ablation",
-45            "query",
-46            "query_best",
-47            "embedding",
-48        ], "Check your query type"
-49
-50    @classmethod
-51    def from_toml(cls, fp: str, *args, **kwargs) -> "GenericConfig":
-52        return super().from_toml(fp, cls, *args, **kwargs)
-53
-54    @classmethod
-55    def from_dict(cls, **kwargs) -> "GenericConfig":
-56        return super().from_dict(cls, **kwargs)
-
- - - - -
-
TrialsQueryConfig(query_type: str, index: str = None, encoder_normalize: bool = True, ablations: bool = False, norm_weight: float = None, automatic: bool = None, encoder: object = None, encoder_fp: str = None, query_weights: List[float] = None, cosine_weights: List[float] = None, evaluate: bool = False, qrels: str = None, config_fn: str = None, query_fn: str = None, parser_fn: str = None, executor_fn: str = None, cosine_ceiling: float = None, topics_path: str = None, return_id_only: bool = False, overwrite_output_if_exists: bool = False, output_file: str = None, run_name: str = None, query_field_usage: str = None, embed_field_usage: str = None, fields: List[str] = None)
-
- -
def validate(self):
25    def validate(self):
-26        """
-27        Checks if query type is included, and checks if an encoder is included for embedding queries
-28        """
-29        if self.query_type == "embedding":
-30            assert self.query_field_usage and self.embed_field_usage, (
-31                "Must have both field usages" " if embedding query"
-32            )
-33            assert (
-34                self.encoder_fp and self.encoder
-35            ), "Must provide encoder path for embedding model"
-36            assert self.norm_weight is not None or self.automatic is not None, (
-37                "Norm weight be specified or be " "automatic "
-38            )
-39
-40        assert (
-41            self.query_field_usage is not None or self.fields is not None
-42        ), "Must have a query field"
-43        assert self.query_type in [
-44            "ablation",
-45            "query",
-46            "query_best",
-47            "embedding",
-48        ], "Check your query type"
-
- - -

Checks if query type is included, and checks if an encoder is included for embedding queries

-
- - -
-
- -
-
@classmethod
def from_toml(cls, fp: str, *args, **kwargs) -> debeir.interfaces.config.GenericConfig:
50    @classmethod
-51    def from_toml(cls, fp: str, *args, **kwargs) -> "GenericConfig":
-52        return super().from_toml(fp, cls, *args, **kwargs)
-
- - -

Instantiates a Config object from a toml file

Parameters

  • fp: File path of the Config TOML file
  • field_class: Class of the Config object to be instantiated
  • args: Arguments to be passed to Config
  • kwargs: Keyword arguments to be passed

Returns

An instantiated and validated Config object.
-
-
-
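A construction sketch, assuming a TOML file whose keys mirror the dataclass fields listed above; the path is hypothetical:

    from debeir.data_sets.clinical_trials import TrialsQueryConfig

    config = TrialsQueryConfig.from_toml("./configs/trials.toml")
    config.validate()  # assertion-based checks, raises AssertionError on bad configs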
@classmethod
def from_dict(cls, **kwargs) -> debeir.interfaces.config.GenericConfig:
54    @classmethod
-55    def from_dict(cls, **kwargs) -> "GenericConfig":
-56        return super().from_dict(cls, **kwargs)
-
- - -

Instantiates a Config object from a dictionary

Parameters

  • data_class:
  • kwargs:

Returns
-
- - -
-
-
Inherited Members
-
- -
-
-
-
- -
class TrialsElasticsearchQuery(debeir.interfaces.query.GenericElasticsearchQuery):
 59class TrialsElasticsearchQuery(GenericElasticsearchQuery):
- 60    """
- 61    Elasticsearch Query object for the Clinical Trials Index
- 62    """
- 63    topics: Dict[int, Dict[str, str]]
- 64    query_type: str
- 65    fields: List[int]
- 66    query_funcs: Dict
- 67    config: GenericConfig
- 68    id_mapping: str = "_id"
- 69    mappings: List[str]
- 70    config: TrialsQueryConfig
- 71
- 72    def __init__(self, topics, query_type, config=None, *args, **kwargs):
- 73        super().__init__(topics, config, *args, **kwargs)
- 74        self.query_type = query_type
- 75        self.config = config
- 76        self.topics = topics
- 77        self.fields = []
- 78        self.mappings = [
- 79            "HasExpandedAccess",
- 80            "BriefSummary.Textblock",
- 81            "CompletionDate.Type",
- 82            "OversightInfo.Text",
- 83            "OverallContactBackup.PhoneExt",
- 84            "RemovedCountries.Text",
- 85            "SecondaryOutcome",
- 86            "Sponsors.LeadSponsor.Text",
- 87            "BriefTitle",
- 88            "IDInfo.NctID",
- 89            "IDInfo.SecondaryID",
- 90            "OverallContactBackup.Phone",
- 91            "Eligibility.StudyPop.Textblock",
- 92            "DetailedDescription.Textblock",
- 93            "Eligibility.MinimumAge",
- 94            "Sponsors.Collaborator",
- 95            "Reference",
- 96            "Eligibility.Criteria.Textblock",
- 97            "XMLName.Space",
- 98            "Rank",
- 99            "OverallStatus",
-100            "InterventionBrowse.Text",
-101            "Eligibility.Text",
-102            "Intervention",
-103            "BiospecDescr.Textblock",
-104            "ResponsibleParty.NameTitle",
-105            "NumberOfArms",
-106            "ResponsibleParty.ResponsiblePartyType",
-107            "IsSection801",
-108            "Acronym",
-109            "Eligibility.MaximumAge",
-110            "DetailedDescription.Text",
-111            "StudyDesign",
-112            "OtherOutcome",
-113            "VerificationDate",
-114            "ConditionBrowse.MeshTerm",
-115            "Enrollment.Text",
-116            "IDInfo.Text",
-117            "ConditionBrowse.Text",
-118            "FirstreceivedDate",
-119            "NumberOfGroups",
-120            "OversightInfo.HasDmc",
-121            "PrimaryCompletionDate.Text",
-122            "ResultsReference",
-123            "Eligibility.StudyPop.Text",
-124            "IsFdaRegulated",
-125            "WhyStopped",
-126            "ArmGroup",
-127            "OverallContact.LastName",
-128            "Phase",
-129            "RemovedCountries.Country",
-130            "InterventionBrowse.MeshTerm",
-131            "Eligibility.HealthyVolunteers",
-132            "Location",
-133            "OfficialTitle",
-134            "OverallContact.Email",
-135            "RequiredHeader.Text",
-136            "RequiredHeader.URL",
-137            "LocationCountries.Country",
-138            "OverallContact.PhoneExt",
-139            "Condition",
-140            "PrimaryOutcome",
-141            "LocationCountries.Text",
-142            "BiospecDescr.Text",
-143            "IDInfo.OrgStudyID",
-144            "Link",
-145            "OverallContact.Phone",
-146            "Source",
-147            "ResponsibleParty.InvestigatorAffiliation",
-148            "StudyType",
-149            "FirstreceivedResultsDate",
-150            "Enrollment.Type",
-151            "Eligibility.Gender",
-152            "OverallContactBackup.LastName",
-153            "Keyword",
-154            "BiospecRetention",
-155            "CompletionDate.Text",
-156            "OverallContact.Text",
-157            "RequiredHeader.DownloadDate",
-158            "Sponsors.Text",
-159            "Text",
-160            "Eligibility.SamplingMethod",
-161            "LastchangedDate",
-162            "ResponsibleParty.InvestigatorFullName",
-163            "StartDate",
-164            "RequiredHeader.LinkText",
-165            "OverallOfficial",
-166            "Sponsors.LeadSponsor.AgencyClass",
-167            "OverallContactBackup.Text",
-168            "Eligibility.Criteria.Text",
-169            "XMLName.Local",
-170            "OversightInfo.Authority",
-171            "PrimaryCompletionDate.Type",
-172            "ResponsibleParty.Organization",
-173            "IDInfo.NctAlias",
-174            "ResponsibleParty.Text",
-175            "TargetDuration",
-176            "Sponsors.LeadSponsor.Agency",
-177            "BriefSummary.Text",
-178            "OverallContactBackup.Email",
-179            "ResponsibleParty.InvestigatorTitle",
-180        ]
-181
-182        self.best_recall_fields = [
-183            "LocationCountries.Country",
-184            "BiospecRetention",
-185            "DetailedDescription.Textblock",
-186            "HasExpandedAccess",
-187            "ConditionBrowse.MeshTerm",
-188            "RequiredHeader.LinkText",
-189            "WhyStopped",
-190            "BriefSummary.Textblock",
-191            "Eligibility.Criteria.Textblock",
-192            "OfficialTitle",
-193            "Eligibility.MaximumAge",
-194            "Eligibility.StudyPop.Textblock",
-195            "BiospecDescr.Textblock",
-196            "BriefTitle",
-197            "Eligibility.MinimumAge",
-198            "ResponsibleParty.Organization",
-199            "TargetDuration",
-200            "Condition",
-201            "IDInfo.OrgStudyID",
-202            "Keyword",
-203            "Source",
-204            "Sponsors.LeadSponsor.Agency",
-205            "ResponsibleParty.InvestigatorAffiliation",
-206            "OversightInfo.Authority",
-207            "OversightInfo.HasDmc",
-208            "OverallContact.Phone",
-209            "Phase",
-210            "OverallContactBackup.LastName",
-211            "Acronym",
-212            "InterventionBrowse.MeshTerm",
-213            "RemovedCountries.Country",
-214        ]
-215        self.best_map_fields = [
-216            "Eligibility.Gender",
-217            "LocationCountries.Country",
-218            "DetailedDescription.Textblock",
-219            "BriefSummary.Textblock",
-220            "ConditionBrowse.MeshTerm",
-221            "Eligibility.Criteria.Textblock",
-222            "InterventionBrowse.MeshTerm",
-223            "StudyType",
-224            "IsFdaRegulated",
-225            "HasExpandedAccess",
-226            "RequiredHeader.LinkText",
-227            "BiospecRetention",
-228            "OfficialTitle",
-229            "Eligibility.SamplingMethod",
-230            "Eligibility.StudyPop.Textblock",
-231            "Condition",
-232            "Eligibility.MinimumAge",
-233            "Keyword",
-234            "Eligibility.MaximumAge",
-235            "BriefTitle",
-236        ]
-237        self.best_embed_fields = [
-238            "WhyStopped",
-239            "HasExpandedAccess",
-240            "BiospecRetention",
-241            "BriefSummary.Textblock",
-242            "LocationCountries.Country",
-243            "ConditionBrowse.MeshTerm",
-244            "DetailedDescription.Textblock",
-245            "RequiredHeader.LinkText",
-246            "Eligibility.Criteria.Textblock",
-247        ]
-248
-249        self.sensible = [
-250            "BriefSummary.Textblock" "BriefTitle",
-251            "Eligibility.StudyPop.Textblock",
-252            "DetailedDescription.Textblock",
-253            "Eligibility.MinimumAge",
-254            "Eligibility.Criteria.Textblock",
-255            "InterventionBrowse.Text",
-256            "Eligibility.Text",
-257            "BiospecDescr.Textblock",
-258            "Eligibility.MaximumAge",
-259            "DetailedDescription.Text",
-260            "ConditionBrowse.MeshTerm",
-261            "ConditionBrowse.Text",
-262            "Eligibility.StudyPop.Text",
-263            "InterventionBrowse.MeshTerm",
-264            "OfficialTitle",
-265            "Condition",
-266            "PrimaryOutcome",
-267            "BiospecDescr.Text",
-268            "Eligibility.Gender",
-269            "Keyword",
-270            "BiospecRetention",
-271            "Eligibility.Criteria.Text",
-272            "BriefSummary.Text",
-273        ]
-274
-275        self.sensible_embed = [
-276            "BriefSummary.Textblock" "BriefTitle",
-277            "Eligibility.StudyPop.Textblock",
-278            "DetailedDescription.Textblock",
-279            "Eligibility.Criteria.Textblock",
-280            "InterventionBrowse.Text",
-281            "Eligibility.Text",
-282            "BiospecDescr.Textblock",
-283            "DetailedDescription.Text",
-284            "ConditionBrowse.MeshTerm",
-285            "ConditionBrowse.Text",
-286            "Eligibility.StudyPop.Text",
-287            "InterventionBrowse.MeshTerm",
-288            "OfficialTitle",
-289            "Condition",
-290            "PrimaryOutcome",
-291            "BiospecDescr.Text",
-292            "Keyword",
-293            "BiospecRetention",
-294            "Eligibility.Criteria.Text",
-295            "BriefSummary.Text",
-296        ]
-297
-298        self.sensible_embed_safe = list(
-299            set(self.best_recall_fields).intersection(set(self.sensible_embed))
-300        )
-301
-302        self.query_funcs = {
-303            "query": self.generate_query,
-304            "ablation": self.generate_query_ablation,
-305            "embedding": self.generate_query_embedding,
-306        }
-307
-308        loguru.logger.debug(self.sensible_embed_safe)
-309
-310        self.field_usage = {
-311            "best_recall_fields": self.best_recall_fields,
-312            "all": self.mappings,
-313            "best_map_fields": self.best_map_fields,
-314            "best_embed_fields": self.best_embed_fields,
-315            "sensible": self.sensible,
-316            "sensible_embed": self.sensible_embed,
-317            "sensible_embed_safe": self.sensible_embed_safe,
-318        }
-319
-320    @apply_config
-321    def generate_query(self, topic_num, query_field_usage, **kwargs) -> Dict:
-322        """
-323        Generates a query for the clinical trials index
-324
-325        :param topic_num: Topic number to search
-326        :param query_field_usage: Which document facets to search over
-327        :param kwargs:
-328        :return:
-329            A basic elasticsearch query for clinical trials
-330        """
-331        fields = self.field_usage[query_field_usage]
-332        should = {"should": []}
-333
-334        qfield = list(self.topics[topic_num].keys())[0]
-335        query = self.topics[topic_num][qfield]
-336
-337        for i, field in enumerate(fields):
-338            should["should"].append(
-339                {
-340                    "match": {
-341                        f"{field}": {
-342                            "query": query,
-343                        }
-344                    }
-345                }
-346            )
-347
-348        query = {
-349            "query": {
-350                "bool": should,
-351            }
-352        }
-353
-354        return query
-355
-356    def generate_query_ablation(self, topic_num, **kwargs):
-357        """
-358        Only search one document facet at a time
-359        :param topic_num:
-360        :param kwargs:
-361        :return:
-362        """
-363        query = {"query": {"match": {}}}
-364
-365        for field in self.fields:
-366            query["query"]["match"][self.mappings[field]] = ""
-367
-368        for qfield in self.fields:
-369            qfield = self.mappings[qfield]
-370            for field in self.topics[topic_num]:
-371                query["query"]["match"][qfield] += self.topics[topic_num][field]
-372
-373        return query
-374
-375    @apply_config
-376    def generate_query_embedding(
-377        self,
-378        topic_num,
-379        encoder,
-380        query_field_usage,
-381        embed_field_usage,
-382        cosine_weights: List[float] = None,
-383        query_weight: List[float] = None,
-384        norm_weight=2.15,
-385        ablations=False,
-386        automatic_scores=None,
-387        **kwargs,
-388    ):
-389        """
-390        Computes the NIR score for a given topic
-391
-392        Score = log(BM25)/log(norm_weight) + embedding_score
-393
-394        :param topic_num:
-395        :param encoder:
-396        :param query_field_usage:
-397        :param embed_field_usage:
-398        :param cosine_weights:
-399        :param query_weight:
-400        :param norm_weight:
-401        :param ablations:
-402        :param automatic_scores:
-403        :param kwargs:
-404        :return:
-405        """
-406        should = {"should": []}
-407
-408        assert norm_weight or automatic_scores
-409
-410        query_fields = self.field_usage[query_field_usage]
-411        embed_fields = self.field_usage[embed_field_usage]
-412
-413        qfield = list(self.topics[topic_num].keys())[0]
-414        query = self.topics[topic_num][qfield]
-415
-416        for i, field in enumerate(query_fields):
-417            should["should"].append(
-418                {
-419                    "match": {
-420                        f"{field}": {
-421                            "query": query,
-422                            "boost": query_weight[i] if query_weight else 1,
-423                        }
-424                    }
-425                }
-426            )
-427
-428        if automatic_scores is not None:
-429            norm_weight = get_z_value(
-430                cosine_ceiling=len(embed_fields) * len(query_fields),
-431                bm25_ceiling=automatic_scores[topic_num],
-432            )
-433
-434        params = {
-435            "weights": cosine_weights if cosine_weights else [1] * len(embed_fields),
-436            "q_eb": encoder.encode(self.topics[topic_num][qfield]),
-437            "offset": 1.0,
-438            "norm_weight": norm_weight,
-439            "disable_bm25": ablations,
-440        }
-441
-442        query = {
-443            "query": {
-444                "script_score": {
-445                    "query": {
-446                        "bool": should,
-447                    },
-448                    "script": generate_script(self.best_embed_fields, params=params),
-449                },
-450            }
-451        }
-452
-453        return query
-454
-455    def get_query_type(self, *args, **kwargs):
-456        return self.query_funcs[self.query_type](*args, **kwargs)
-457
-458    def get_id_mapping(self, hit):
-459        return hit[self.id_mapping]
-
- - -

Elasticsearch Query object for the Clinical Trials Index

-
- - -
TrialsElasticsearchQuery(topics, query_type, config=None, *args, **kwargs)
 72    def __init__(self, topics, query_type, config=None, *args, **kwargs):
- 73        super().__init__(topics, config, *args, **kwargs)
- 74        self.query_type = query_type
- 75        self.config = config
- 76        self.topics = topics
- 77        self.fields = []
- 78        self.mappings = [
- 79            "HasExpandedAccess",
- 80            "BriefSummary.Textblock",
- 81            "CompletionDate.Type",
- 82            "OversightInfo.Text",
- 83            "OverallContactBackup.PhoneExt",
- 84            "RemovedCountries.Text",
- 85            "SecondaryOutcome",
- 86            "Sponsors.LeadSponsor.Text",
- 87            "BriefTitle",
- 88            "IDInfo.NctID",
- 89            "IDInfo.SecondaryID",
- 90            "OverallContactBackup.Phone",
- 91            "Eligibility.StudyPop.Textblock",
- 92            "DetailedDescription.Textblock",
- 93            "Eligibility.MinimumAge",
- 94            "Sponsors.Collaborator",
- 95            "Reference",
- 96            "Eligibility.Criteria.Textblock",
- 97            "XMLName.Space",
- 98            "Rank",
- 99            "OverallStatus",
-100            "InterventionBrowse.Text",
-101            "Eligibility.Text",
-102            "Intervention",
-103            "BiospecDescr.Textblock",
-104            "ResponsibleParty.NameTitle",
-105            "NumberOfArms",
-106            "ResponsibleParty.ResponsiblePartyType",
-107            "IsSection801",
-108            "Acronym",
-109            "Eligibility.MaximumAge",
-110            "DetailedDescription.Text",
-111            "StudyDesign",
-112            "OtherOutcome",
-113            "VerificationDate",
-114            "ConditionBrowse.MeshTerm",
-115            "Enrollment.Text",
-116            "IDInfo.Text",
-117            "ConditionBrowse.Text",
-118            "FirstreceivedDate",
-119            "NumberOfGroups",
-120            "OversightInfo.HasDmc",
-121            "PrimaryCompletionDate.Text",
-122            "ResultsReference",
-123            "Eligibility.StudyPop.Text",
-124            "IsFdaRegulated",
-125            "WhyStopped",
-126            "ArmGroup",
-127            "OverallContact.LastName",
-128            "Phase",
-129            "RemovedCountries.Country",
-130            "InterventionBrowse.MeshTerm",
-131            "Eligibility.HealthyVolunteers",
-132            "Location",
-133            "OfficialTitle",
-134            "OverallContact.Email",
-135            "RequiredHeader.Text",
-136            "RequiredHeader.URL",
-137            "LocationCountries.Country",
-138            "OverallContact.PhoneExt",
-139            "Condition",
-140            "PrimaryOutcome",
-141            "LocationCountries.Text",
-142            "BiospecDescr.Text",
-143            "IDInfo.OrgStudyID",
-144            "Link",
-145            "OverallContact.Phone",
-146            "Source",
-147            "ResponsibleParty.InvestigatorAffiliation",
-148            "StudyType",
-149            "FirstreceivedResultsDate",
-150            "Enrollment.Type",
-151            "Eligibility.Gender",
-152            "OverallContactBackup.LastName",
-153            "Keyword",
-154            "BiospecRetention",
-155            "CompletionDate.Text",
-156            "OverallContact.Text",
-157            "RequiredHeader.DownloadDate",
-158            "Sponsors.Text",
-159            "Text",
-160            "Eligibility.SamplingMethod",
-161            "LastchangedDate",
-162            "ResponsibleParty.InvestigatorFullName",
-163            "StartDate",
-164            "RequiredHeader.LinkText",
-165            "OverallOfficial",
-166            "Sponsors.LeadSponsor.AgencyClass",
-167            "OverallContactBackup.Text",
-168            "Eligibility.Criteria.Text",
-169            "XMLName.Local",
-170            "OversightInfo.Authority",
-171            "PrimaryCompletionDate.Type",
-172            "ResponsibleParty.Organization",
-173            "IDInfo.NctAlias",
-174            "ResponsibleParty.Text",
-175            "TargetDuration",
-176            "Sponsors.LeadSponsor.Agency",
-177            "BriefSummary.Text",
-178            "OverallContactBackup.Email",
-179            "ResponsibleParty.InvestigatorTitle",
-180        ]
-181
-182        self.best_recall_fields = [
-183            "LocationCountries.Country",
-184            "BiospecRetention",
-185            "DetailedDescription.Textblock",
-186            "HasExpandedAccess",
-187            "ConditionBrowse.MeshTerm",
-188            "RequiredHeader.LinkText",
-189            "WhyStopped",
-190            "BriefSummary.Textblock",
-191            "Eligibility.Criteria.Textblock",
-192            "OfficialTitle",
-193            "Eligibility.MaximumAge",
-194            "Eligibility.StudyPop.Textblock",
-195            "BiospecDescr.Textblock",
-196            "BriefTitle",
-197            "Eligibility.MinimumAge",
-198            "ResponsibleParty.Organization",
-199            "TargetDuration",
-200            "Condition",
-201            "IDInfo.OrgStudyID",
-202            "Keyword",
-203            "Source",
-204            "Sponsors.LeadSponsor.Agency",
-205            "ResponsibleParty.InvestigatorAffiliation",
-206            "OversightInfo.Authority",
-207            "OversightInfo.HasDmc",
-208            "OverallContact.Phone",
-209            "Phase",
-210            "OverallContactBackup.LastName",
-211            "Acronym",
-212            "InterventionBrowse.MeshTerm",
-213            "RemovedCountries.Country",
-214        ]
-215        self.best_map_fields = [
-216            "Eligibility.Gender",
-217            "LocationCountries.Country",
-218            "DetailedDescription.Textblock",
-219            "BriefSummary.Textblock",
-220            "ConditionBrowse.MeshTerm",
-221            "Eligibility.Criteria.Textblock",
-222            "InterventionBrowse.MeshTerm",
-223            "StudyType",
-224            "IsFdaRegulated",
-225            "HasExpandedAccess",
-226            "RequiredHeader.LinkText",
-227            "BiospecRetention",
-228            "OfficialTitle",
-229            "Eligibility.SamplingMethod",
-230            "Eligibility.StudyPop.Textblock",
-231            "Condition",
-232            "Eligibility.MinimumAge",
-233            "Keyword",
-234            "Eligibility.MaximumAge",
-235            "BriefTitle",
-236        ]
-237        self.best_embed_fields = [
-238            "WhyStopped",
-239            "HasExpandedAccess",
-240            "BiospecRetention",
-241            "BriefSummary.Textblock",
-242            "LocationCountries.Country",
-243            "ConditionBrowse.MeshTerm",
-244            "DetailedDescription.Textblock",
-245            "RequiredHeader.LinkText",
-246            "Eligibility.Criteria.Textblock",
-247        ]
-248
-249        self.sensible = [
-250            "BriefSummary.Textblock" "BriefTitle",
-251            "Eligibility.StudyPop.Textblock",
-252            "DetailedDescription.Textblock",
-253            "Eligibility.MinimumAge",
-254            "Eligibility.Criteria.Textblock",
-255            "InterventionBrowse.Text",
-256            "Eligibility.Text",
-257            "BiospecDescr.Textblock",
-258            "Eligibility.MaximumAge",
-259            "DetailedDescription.Text",
-260            "ConditionBrowse.MeshTerm",
-261            "ConditionBrowse.Text",
-262            "Eligibility.StudyPop.Text",
-263            "InterventionBrowse.MeshTerm",
-264            "OfficialTitle",
-265            "Condition",
-266            "PrimaryOutcome",
-267            "BiospecDescr.Text",
-268            "Eligibility.Gender",
-269            "Keyword",
-270            "BiospecRetention",
-271            "Eligibility.Criteria.Text",
-272            "BriefSummary.Text",
-273        ]
-274
-275        self.sensible_embed = [
-276            "BriefSummary.Textblock" "BriefTitle",
-277            "Eligibility.StudyPop.Textblock",
-278            "DetailedDescription.Textblock",
-279            "Eligibility.Criteria.Textblock",
-280            "InterventionBrowse.Text",
-281            "Eligibility.Text",
-282            "BiospecDescr.Textblock",
-283            "DetailedDescription.Text",
-284            "ConditionBrowse.MeshTerm",
-285            "ConditionBrowse.Text",
-286            "Eligibility.StudyPop.Text",
-287            "InterventionBrowse.MeshTerm",
-288            "OfficialTitle",
-289            "Condition",
-290            "PrimaryOutcome",
-291            "BiospecDescr.Text",
-292            "Keyword",
-293            "BiospecRetention",
-294            "Eligibility.Criteria.Text",
-295            "BriefSummary.Text",
-296        ]
-297
-298        self.sensible_embed_safe = list(
-299            set(self.best_recall_fields).intersection(set(self.sensible_embed))
-300        )
-301
-302        self.query_funcs = {
-303            "query": self.generate_query,
-304            "ablation": self.generate_query_ablation,
-305            "embedding": self.generate_query_embedding,
-306        }
-307
-308        loguru.logger.debug(self.sensible_embed_safe)
-309
-310        self.field_usage = {
-311            "best_recall_fields": self.best_recall_fields,
-312            "all": self.mappings,
-313            "best_map_fields": self.best_map_fields,
-314            "best_embed_fields": self.best_embed_fields,
-315            "sensible": self.sensible,
-316            "sensible_embed": self.sensible_embed,
-317            "sensible_embed_safe": self.sensible_embed_safe,
-318        }
-
def generate_query(self, *args, **kwargs):
231    def use_config(self, *args, **kwargs):
-232        """
-233        Replaces keywords and args passed to the function with ones from self.config.
-234
-235        :param self:
-236        :param args: To be updated
-237        :param kwargs: To be updated
-238        :return:
-239        """
-240        if self.config is not None:
-241            kwargs = self.config.__update__(**kwargs)
-242
-243        return func(self, *args, **kwargs)
-
- - -

Generates a query for the clinical trials index

Parameters

  • topic_num: Topic number to search
  • query_field_usage: Which document facets to search over
  • kwargs:

Returns

A basic elasticsearch query for clinical trials
-
-
-
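For reference, the returned dict follows the standard Elasticsearch bool/should shape; a sketch with hypothetical fields and topic text:

    # Shape of the dict returned by generate_query (fields and text hypothetical).
    query = {
        "query": {
            "bool": {
                "should": [
                    {"match": {"BriefTitle": {"query": "female with breast cancer"}}},
                    {"match": {"BriefSummary.Textblock": {"query": "female with breast cancer"}}},
                ]
            }
        }
    }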
def generate_query_ablation(self, topic_num, **kwargs):
356    def generate_query_ablation(self, topic_num, **kwargs):
-357        """
-358        Only search one document facet at a time
-359        :param topic_num:
-360        :param kwargs:
-361        :return:
-362        """
-363        query = {"query": {"match": {}}}
-364
-365        for field in self.fields:
-366            query["query"]["match"][self.mappings[field]] = ""
-367
-368        for qfield in self.fields:
-369            qfield = self.mappings[qfield]
-370            for field in self.topics[topic_num]:
-371                query["query"]["match"][qfield] += self.topics[topic_num][field]
-372
-373        return query
-
- - -

Only search one document facet at a time

- -
Parameters
- -
    -
  • topic_num:
  • -
  • kwargs:
  • -
- -
Returns
-
- - -
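For illustration, the dictionary built above places every mapped field under a single match clause and fills each with the concatenated topic text. A minimal sketch with hypothetical mappings and topics (the real values come from self.mappings and self.topics):

    mappings = {"title": "OfficialTitle", "summary": "BriefSummary.Text"}
    topics = {1: {"text": "chronic kidney disease"}}

    query = {"query": {"match": {}}}
    for qfield in mappings.values():
        # Start each facet empty, then append every topic field's text to it
        query["query"]["match"][qfield] = ""
        for field in topics[1]:
            query["query"]["match"][qfield] += topics[1][field]

    # query == {'query': {'match': {'OfficialTitle': 'chronic kidney disease',
    #                               'BriefSummary.Text': 'chronic kidney disease'}}}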
-
- -
- - def - generate_query_embedding(self, *args, **kwargs): - - - -
- -
231    def use_config(self, *args, **kwargs):
-232        """
-233        Replaces keywords and args passed to the function with ones from self.config.
-234
-235        :param self:
-236        :param args: To be updated
-237        :param kwargs: To be updated
-238        :return:
-239        """
-240        if self.config is not None:
-241            kwargs = self.config.__update__(**kwargs)
-242
-243        return func(self, *args, **kwargs)
-
- - -

Computes the NIR score for a given topic

- -

Score = log(BM25)/log(norm_weight) + embedding_score

- -
Parameters
- -
    -
  • topic_num:
  • -
  • encoder:
  • -
  • query_field_usage:
  • -
  • embed_field_usage:
  • -
  • cosine_weights:
  • -
  • query_weight:
  • -
  • norm_weight:
  • -
  • ablations:
  • -
  • automatic_scores:
  • -
  • kwargs:
  • -
- -
Returns
-
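The combined score is easy to reproduce outside the executor. A minimal sketch of the formula above; bm25_score and embedding_score are hypothetical stand-ins for the values the executor obtains from Elasticsearch and the sentence encoder:

    import math

    def nir_score(bm25_score: float, embedding_score: float, norm_weight: float = 2.15) -> float:
        # Dampen the lexical score logarithmically so the cosine (embedding)
        # component stays influential in the final ranking.
        return math.log(bm25_score) / math.log(norm_weight) + embedding_score

    print(round(nir_score(bm25_score=20.0, embedding_score=0.83), 3))  # 4.744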
- - -
-
- -
- - def - get_query_type(self, *args, **kwargs): - - - -
- -
455    def get_query_type(self, *args, **kwargs):
-456        return self.query_funcs[self.query_type](*args, **kwargs)
-
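get_query_type dispatches on the query_type configured for the run, so callers select a generator by name rather than calling it directly. A sketch of the call shape, where query_obj is a hypothetical TrialsElasticsearchQuery instance:

    query_obj.query_type = "query"  # or "ablation" / "embedding"
    body = query_obj.get_query_type(topic_num=1, query_field_usage="best_recall_fields")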
- - - - -
-
- -
- - def - get_id_mapping(self, hit): - - - -
- -
458    def get_id_mapping(self, hit):
-459        return hit[self.id_mapping]
-
- - -

Get the document ID

- -
Parameters
- -
    -
  • hit: The raw document result
  • -
- -
Returns
- -
-
The document's ID
-
-
-
- - -
- -
-
- -
- - class - ClinicalTrialsElasticsearchExecutor(debeir.interfaces.executor.GenericElasticsearchExecutor): - - - -
- -
462class ClinicalTrialsElasticsearchExecutor(GenericElasticsearchExecutor):
-463    """
-464    Executes queries given a query object.
-465    """
-466    query: TrialsElasticsearchQuery
-467
-468    def __init__(
-469        self,
-470        topics: Dict[Union[str, int], Dict[str, str]],
-471        client: Elasticsearch,
-472        index_name: str,
-473        output_file: str,
-474        query: TrialsElasticsearchQuery,
-475        encoder: Optional[Encoder] = None,
-476        config=None,
-477        *args,
-478        **kwargs,
-479    ):
-480
-481        super().__init__(
-482            topics,
-483            client,
-484            index_name,
-485            output_file,
-486            query,
-487            encoder,
-488            config=config,
-489            *args,
-490            **kwargs,
-491        )
-492
-493        self.query_fns = {
-494            "query": self.generate_query,
-495            "ablation": self.generate_query_ablation,
-496            "embedding": self.generate_embedding_query,
-497        }
-
- - -

Executes queries given a query object.

-
- - -
- -
- - ClinicalTrialsElasticsearchExecutor( topics: Dict[Union[str, int], Dict[str, str]], client: elasticsearch.AsyncElasticsearch, index_name: str, output_file: str, query: debeir.data_sets.clinical_trials.TrialsElasticsearchQuery, encoder: Optional[debeir.rankers.transformer_sent_encoder.Encoder] = None, config=None, *args, **kwargs) - - - -
- -
468    def __init__(
-469        self,
-470        topics: Dict[Union[str, int], Dict[str, str]],
-471        client: Elasticsearch,
-472        index_name: str,
-473        output_file: str,
-474        query: TrialsElasticsearchQuery,
-475        encoder: Optional[Encoder] = None,
-476        config=None,
-477        *args,
-478        **kwargs,
-479    ):
-480
-481        super().__init__(
-482            topics,
-483            client,
-484            index_name,
-485            output_file,
-486            query,
-487            encoder,
-488            config=config,
-489            *args,
-490            **kwargs,
-491        )
-492
-493        self.query_fns = {
-494            "query": self.generate_query,
-495            "ablation": self.generate_query_ablation,
-496            "embedding": self.generate_embedding_query,
-497        }
-
- - - - -
- -
-
- -
- - class - ClinicalTrialParser(debeir.interfaces.parser.Parser): - - - -
- -
500class ClinicalTrialParser(Parser):
-501    """
-502    Parser for Clinical Trials topics
-503    """
-504    @classmethod
-505    def get_topics(cls, csvfile) -> Dict[int, Dict[str, str]]:
-506        topics = {}
-507        reader = csv.reader(csvfile)
-508        for i, row in enumerate(reader):
-509            if i == 0:
-510                continue
-511
-512            _id = row[0]
-513            text = row[1]
-514
-515            topics[_id] = {"text": text}
-516
-517        return topics
-
- - -

Parser for Clinical Trials topics

-
- - -
-
- - ClinicalTrialParser() - - -
- - - - -
-
- -
-
@classmethod
- - def - get_topics(cls, csvfile) -> Dict[int, Dict[str, str]]: - - - -
- -
504    @classmethod
-505    def get_topics(cls, csvfile) -> Dict[int, Dict[str, str]]:
-506        topics = {}
-507        reader = csv.reader(csvfile)
-508        for i, row in enumerate(reader):
-509            if i == 0:
-510                continue
-511
-512            _id = row[0]
-513            text = row[1]
-514
-515            topics[_id] = {"text": text}
-516
-517        return topics
-
- - -

Gets topics from the given file; forwards parameters to the _get_topics class method.

-
- - -
-
-
Inherited Members
-
- -
-
-
-
- - \ No newline at end of file diff --git a/docs/debeir/data_sets/factory.html b/docs/debeir/data_sets/factory.html deleted file mode 100644 index 665b6ea..0000000 --- a/docs/debeir/data_sets/factory.html +++ /dev/null @@ -1,683 +0,0 @@ - - - - - - - debeir.data_sets.factory API documentation - - - - - - - - - -
-
-

-debeir.data_sets.factory

- - - - - - -
  1from pathlib import Path
-  2from typing import Dict, Type, Union
-  3
-  4import toml
-  5
-  6from debeir.evaluation.evaluator import Evaluator
-  7from debeir.evaluation.residual_scoring import ResidualEvaluator
-  8from debeir.data_sets.trec_clinical_trials import TrecClincialElasticsearchQuery, TrecClinicalTrialsParser
-  9from debeir.interfaces.config import GenericConfig, _NIRMasterConfig, SolrConfig, ElasticsearchConfig, MetricsConfig, \
- 10    NIRConfig, Config
- 11from debeir.interfaces.query import GenericElasticsearchQuery, Query
- 12from debeir.data_sets.clinical_trials import TrialsElasticsearchQuery
- 13from debeir.data_sets.trec_covid import TrecElasticsearchQuery
- 14
- 15from debeir.data_sets.clinical_trials import (
- 16    ClinicalTrialsElasticsearchExecutor,
- 17    ClinicalTrialParser,
- 18    TrialsQueryConfig,
- 19)
- 20from debeir.data_sets.marco import MarcoElasticsearchExecutor, MarcoQueryConfig
- 21from debeir.interfaces.executor import GenericElasticsearchExecutor
- 22from debeir.interfaces.parser import (
- 23    CSVParser, Parser, TSVParser,
- 24)
- 25from debeir.data_sets.bioreddit import BioRedditSubmissionParser, BioRedditCommentParser
- 26from debeir.data_sets.trec_covid import TrecCovidParser
- 27
- 28str_to_config_cls = {
- 29    "clinical_trials": TrialsQueryConfig,
- 30    "test_trials": TrialsQueryConfig,
- 31    "med-marco": MarcoQueryConfig,
- 32    "generic": MarcoQueryConfig,
- 33}
- 34
- 35query_factory = {
- 36    "clinical_trials": TrialsElasticsearchQuery,
- 37    "test_trials": TrialsElasticsearchQuery,
- 38    "generic": GenericElasticsearchQuery,
- 39    "trec_covid": TrecElasticsearchQuery,
- 40    "trec_clinical": TrecClincialElasticsearchQuery,
- 41}
- 42
- 43parser_factory = {
- 44    "trec_covid": TrecCovidParser,
- 45    "bioreddit-comment": BioRedditCommentParser,
- 46    "bioreddit-submission": BioRedditSubmissionParser,
- 47    "test_trials": ClinicalTrialParser,
- 48    "med-marco": CSVParser,
- 49    "tsv": TSVParser,
- 50    "trec_clinical": TrecClinicalTrialsParser
- 51}
- 52
- 53executor_factory = {
- 54    "clinical": ClinicalTrialsElasticsearchExecutor,
- 55    "med-marco": MarcoElasticsearchExecutor,
- 56    "generic": GenericElasticsearchExecutor,
- 57}
- 58
- 59evaluator_factory = {
- 60    "residual": ResidualEvaluator,
- 61    "trec": Evaluator,
- 62}
- 63
- 64
- 65def get_index_name(config_fp):
- 66    """
- 67    Get the index name from the config without parsing as a TOML
- 68
- 69    :param config_fp:
- 70    :return:
- 71    """
- 72    with open(config_fp, "r") as reader:
- 73        for line in reader:
- 74            if line.startswith("index"):
- 75                line = line.replace('"', "")
- 76                return line.split("=")[-1].strip()
- 77    return None
- 78
- 79
- 80def factory_fn(config_fp, index=None) -> (Query, GenericConfig,
- 81                                          Parser, GenericElasticsearchExecutor, Evaluator):
- 82    """
- 83    Factory method for creating the parsed topics, config object, query object and query executor object
- 84
- 85    :param config_fp: Config file path
- 86    :param index: Index to search
- 87    :return:
- 88        Query, Config, Parser, Executor, Evaluator
- 89    """
- 90    config = config_factory(config_fp)
- 91    assert config.index is not None
- 92    query_cls = query_factory[config.query_fn]
- 93    parser = parser_factory[config.parser_fn]
- 94    executor = executor_factory[config.executor_fn]
- 95
- 96    return query_cls, config, parser, executor
- 97
- 98
- 99def config_factory(path: Union[str, Path] = None, config_cls: Type[Config] = None, args_dict: Dict = None):
-100    """
-101    Factory method for creating configs
-102
-103    :param path: Config path
-104    :param config_cls: Config class to instantiate
-105    :param args_dict: Arguments to consider
-106    :return:
-107        A config object
-108    """
-109    if path:
-110        args_dict = toml.load(path)
-111
-112    if not config_cls:
-113        if "config_fn" in args_dict:
-114            config_cls = str_to_config_cls[args_dict["config_fn"]]
-115        else:
-116            raise NotImplementedError()
-117
-118    return config_cls.from_args(args_dict, config_cls)
-119
-120
-121def get_nir_config(nir_config, *args, ignore_errors=False, **kwargs):
-122    main_config = config_factory(nir_config, config_cls=_NIRMasterConfig)
-123    search_engine_config = None
-124
-125    supported_search_engines = {"solr": SolrConfig,
-126                                "elasticsearch": ElasticsearchConfig}
-127
-128    search_engine_config = None
-129
-130    if 'engine' in kwargs and kwargs['engine'] in supported_search_engines:
-131        search_engine = kwargs['engine']
-132        search_engine_config = config_factory(args_dict=main_config.get_search_engine_settings(search_engine),
-133                                              config_cls=supported_search_engines[search_engine])
-134
-135    #for search_engine in supported_search_engines:
-136    #    if search_engine in kwargs and kwargs[search_engine] and kwargs['engine'] == search_engine:
-137    #        search_engine_config = config_factory(args_dict=main_config.get_search_engine_settings(search_engine),
-138    #                                              config_cls=supported_search_engines[search_engine])
-139
-140    if not ignore_errors and search_engine_config is None:
-141        raise RuntimeError("Unable to get a search engine configuration.")
-142
-143    metrics_config = config_factory(args_dict=main_config.get_metrics(), config_cls=MetricsConfig)
-144    nir_config = config_factory(args_dict=main_config.get_nir_settings(), config_cls=NIRConfig)
-145
-146    return nir_config, search_engine_config, metrics_config
-147
-148
-149def apply_nir_config(func):
-150    """
-151    Decorator that applies the NIR config settings to the current function
-152    Replaces arguments and keyword arguments with those found in the config
-153
-154    :param func:
-155    :return:
-156    """
-157
-158    def parse_nir_config(*args, ignore_errors=False, **kwargs):
-159        """
-160        Parses the NIR config for the different setting groups: Search Engine, Metrics and NIR settings
-161        Applies these settings to the current function
-162        :param ignore_errors:
-163        :param args:
-164        :param kwargs:
-165        :return:
-166        """
-167
-168        nir_config, search_engine_config, metrics_config = get_nir_config(*args,
-169                                                                          ignore_errors,
-170                                                                          **kwargs)
-171
-172        kwargs = nir_config.__update__(
-173            **search_engine_config.__update__(
-174                **metrics_config.__update__(**kwargs)
-175            )
-176        )
-177
-178        return func(*args, **kwargs)
-179
-180    return parse_nir_config
-
- - -
-
- -
- - def - get_index_name(config_fp): - - - -
- -
66def get_index_name(config_fp):
-67    """
-68    Get the index name from the config without parsing as a TOML
-69
-70    :param config_fp:
-71    :return:
-72    """
-73    with open(config_fp, "r") as reader:
-74        for line in reader:
-75            if line.startswith("index"):
-76                line = line.replace('"', "")
-77                return line.split("=")[-1].strip()
-78    return None
-
- - -

Get the index name from the config without parsing as a TOML

- -
Parameters
- -
    -
  • config_fp:
  • -
- -
Returns
-
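For example, if the TOML config contains the line index = "clinical_trials_2021", the scan strips the quotes and returns the value after the equals sign; the path below is hypothetical:

    index_name = get_index_name("./configs/clinical_trials.toml")
    print(index_name)  # clinical_trials_2021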
- - -
-
- -
- - def - factory_fn( config_fp, index=None) -> (<class 'debeir.interfaces.query.Query'>, <class 'debeir.interfaces.config.GenericConfig'>, <class 'debeir.interfaces.parser.Parser'>, <class 'debeir.interfaces.executor.GenericElasticsearchExecutor'>, <class 'debeir.evaluation.evaluator.Evaluator'>): - - - -
- -
81def factory_fn(config_fp, index=None) -> (Query, GenericConfig,
-82                                          Parser, GenericElasticsearchExecutor, Evaluator):
-83    """
-84    Factory method for creating the parsed topics, config object, query object and query executor object
-85
-86    :param config_fp: Config file path
-87    :param index: Index to search
-88    :return:
-89        Query, Config, Parser, Executor, Evaluator
-90    """
-91    config = config_factory(config_fp)
-92    assert config.index is not None
-93    query_cls = query_factory[config.query_fn]
-94    parser = parser_factory[config.parser_fn]
-95    executor = executor_factory[config.executor_fn]
-96
-97    return query_cls, config, parser, executor
-
- - -

Factory method for creating the parsed topics, config object, query object and query executor object

- -
Parameters
- -
    -
  • config_fp: Config file path
  • -
  • index: Index to search
  • -
- -
Returns
- -
-
Query, Config, Parser, Executor, Evaluator
-
-
-
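Typical wiring, assuming a TOML config whose config_fn, query_fn, parser_fn and executor_fn keys name entries in the factory tables above; the config path is hypothetical. Note that the function body returns four objects, so unpack four values:

    query_cls, config, parser, executor_cls = factory_fn("./configs/clinical_trials.toml")

    with open(config.topics_path) as f:
        topics = parser.get_topics(f)

    query = query_cls(topics=topics, config=config)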
- - -
-
- -
- - def - config_factory( path: Union[str, pathlib.Path] = None, config_cls: Type[debeir.interfaces.config.Config] = None, args_dict: Dict = None): - - - -
- -
100def config_factory(path: Union[str, Path] = None, config_cls: Type[Config] = None, args_dict: Dict = None):
-101    """
-102    Factory method for creating configs
-103
-104    :param path: Config path
-105    :param config_cls: Config class to instantiate
-106    :param args_dict: Arguments to consider
-107    :return:
-108        A config object
-109    """
-110    if path:
-111        args_dict = toml.load(path)
-112
-113    if not config_cls:
-114        if "config_fn" in args_dict:
-115            config_cls = str_to_config_cls[args_dict["config_fn"]]
-116        else:
-117            raise NotImplementedError()
-118
-119    return config_cls.from_args(args_dict, config_cls)
-
- - -

Factory method for creating configs

- -
Parameters
- -
    -
  • path: Config path
  • -
  • config_cls: Config class to instantiate
  • -
  • args_dict: Arguments to consider
  • -
- -
Returns
- -
-
A config object
-
-
-
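Both entry points are usable; a small sketch, assuming the TOML file defines a config_fn key that names an entry in str_to_config_cls:

    # From a file: toml.load() supplies args_dict and config_fn selects the class.
    config = config_factory("./configs/med_marco.toml")

    # From a dict, with the config class given explicitly.
    config = config_factory(config_cls=MarcoQueryConfig,
                            args_dict={"query_type": "query", "index": "med-marco"})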
- - -
-
- -
- - def - get_nir_config(nir_config, *args, ignore_errors=False, **kwargs): - - - -
- -
122def get_nir_config(nir_config, *args, ignore_errors=False, **kwargs):
-123    main_config = config_factory(nir_config, config_cls=_NIRMasterConfig)
-124    search_engine_config = None
-125
-126    supported_search_engines = {"solr": SolrConfig,
-127                                "elasticsearch": ElasticsearchConfig}
-128
-129    search_engine_config = None
-130
-131    if 'engine' in kwargs and kwargs['engine'] in supported_search_engines:
-132        search_engine = kwargs['engine']
-133        search_engine_config = config_factory(args_dict=main_config.get_search_engine_settings(search_engine),
-134                                              config_cls=supported_search_engines[search_engine])
-135
-136    #for search_engine in supported_search_engines:
-137    #    if search_engine in kwargs and kwargs[search_engine] and kwargs['engine'] == search_engine:
-138    #        search_engine_config = config_factory(args_dict=main_config.get_search_engine_settings(search_engine),
-139    #                                              config_cls=supported_search_engines[search_engine])
-140
-141    if not ignore_errors and search_engine_config is None:
-142        raise RuntimeError("Unable to get a search engine configuration.")
-143
-144    metrics_config = config_factory(args_dict=main_config.get_metrics(), config_cls=MetricsConfig)
-145    nir_config = config_factory(args_dict=main_config.get_nir_settings(), config_cls=NIRConfig)
-146
-147    return nir_config, search_engine_config, metrics_config
-
- - - - -
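get_nir_config splits the master NIR TOML into three typed config objects. The engine keyword must name a supported engine ("solr" or "elasticsearch"), otherwise a RuntimeError is raised unless ignore_errors is set; the config path below is hypothetical:

    nir_config, search_engine_config, metrics_config = get_nir_config(
        "./configs/nir.toml",
        engine="elasticsearch",
    )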
-
- -
- - def - apply_nir_config(func): - - - -
- -
150def apply_nir_config(func):
-151    """
-152    Decorator that applies the NIR config settings to the current function
-153    Replaces arguments and keyword arguments with those found in the config
-154
-155    :param func:
-156    :return:
-157    """
-158
-159    def parse_nir_config(*args, ignore_errors=False, **kwargs):
-160        """
-161        Parses the NIR config for the different setting groups: Search Engine, Metrics and NIR settings
-162        Applies these settings to the current function
-163        :param ignore_errors:
-164        :param args:
-165        :param kwargs:
-166        :return:
-167        """
-168
-169        nir_config, search_engine_config, metrics_config = get_nir_config(*args,
-170                                                                          ignore_errors,
-171                                                                          **kwargs)
-172
-173        kwargs = nir_config.__update__(
-174            **search_engine_config.__update__(
-175                **metrics_config.__update__(**kwargs)
-176            )
-177        )
-178
-179        return func(*args, **kwargs)
-180
-181    return parse_nir_config
-
- - -

Decorator that applies the NIR config settings to the current function. Replaces arguments and keyword arguments with those found in the config.

- -
Parameters
- -
    -
  • func:
  • -
- -
Returns
-
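Because parse_nir_config forwards the same positional arguments both to get_nir_config and to the wrapped function, the first positional argument of a decorated function ends up being the master config path. A plausible call shape under that reading; the function name and parameters here are hypothetical:

    @apply_nir_config
    def run_experiment(nir_config, index=None, norm_weight=None, **kwargs):
        # index, norm_weight and the metrics settings arrive merged in from
        # the Search Engine, Metrics and NIR sections of the master config.
        ...

    run_experiment("./configs/nir.toml", engine="elasticsearch")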
- - -
-
- - \ No newline at end of file diff --git a/docs/debeir/data_sets/marco.html b/docs/debeir/data_sets/marco.html deleted file mode 100644 index 3b0197f..0000000 --- a/docs/debeir/data_sets/marco.html +++ /dev/null @@ -1,771 +0,0 @@ - - - - - - - debeir.data_sets.marco API documentation - - - - - - - - - -
-
-

-debeir.data_sets.marco

- - - - - - -
 1from dataclasses import dataclass
- 2from typing import Dict, Union, Optional
- 3
- 4from elasticsearch import AsyncElasticsearch as Elasticsearch
- 5
- 6from debeir.interfaces.config import GenericConfig
- 7from debeir.interfaces.executor import GenericElasticsearchExecutor
- 8from debeir.interfaces.query import GenericElasticsearchQuery
- 9from debeir.rankers.transformer_sent_encoder import Encoder
-10
-11
-12class MarcoElasticsearchExecutor(GenericElasticsearchExecutor):
-13    query: GenericElasticsearchQuery
-14
-15    def __init__(
-16        self,
-17        topics: Dict[Union[str, int], Dict[str, str]],
-18        client: Elasticsearch,
-19        index_name: str,
-20        output_file: str,
-21        query: GenericElasticsearchQuery,
-22        encoder: Optional[Encoder] = None,
-23        config=None,
-24        *args,
-25        **kwargs,
-26    ):
-27        super().__init__(
-28            topics,
-29            client,
-30            index_name,
-31            output_file,
-32            query,
-33            encoder,
-34            config=config,
-35            *args,
-36            **kwargs,
-37        )
-38
-39        self.query_fns = {
-40            "query": self.generate_query,
-41            "embedding": self.generate_embedding_query,
-42        }
-43
-44    def generate_query(self, topic_num, best_fields=True, **kwargs):
-45        return self.query.generate_query(topic_num)
-46
-47    def generate_embedding_query(
-48        self,
-49        topic_num,
-50        cosine_weights=None,
-51        query_weights=None,
-52        norm_weight=2.15,
-53        automatic_scores=None,
-54        **kwargs,
-55    ):
-56        return super().generate_embedding_query(
-57            topic_num,
-58            cosine_weights=cosine_weights,
-59            query_weights=query_weights,
-60            norm_weight=norm_weight,
-61            automatic_scores=automatic_scores,
-62            **kwargs,
-63        )
-64
-65    async def execute_query(
-66        self, query=None, topic_num=None, ablation=False, query_type="query", **kwargs
-67    ):
-68        return await super().execute_query(
-69            query, topic_num, ablation, query_type=query_type, **kwargs
-70        )
-71
-72
-73@dataclass(init=True, unsafe_hash=True)
-74class MarcoQueryConfig(GenericConfig):
-75    def validate(self):
-76        if self.query_type == "embedding":
-77            assert (
-78                self.encoder_fp and self.encoder
-79            ), "Must provide encoder path for embedding model"
-80            assert self.norm_weight is not None or self.automatic is not None, (
-81                "Norm weight must be specified or be automatic"
-82            )
-83
-84    @classmethod
-85    def from_toml(cls, fp: str, *args, **kwargs) -> "MarcoQueryConfig":
-86        return super().from_toml(fp, cls, *args, **kwargs)
-87
-88    @classmethod
-89    def from_dict(cls, **kwargs) -> "MarcoQueryConfig":
-90        return super().from_dict(cls, **kwargs)
-
- - -
-
- -
- - class - MarcoElasticsearchExecutor(debeir.interfaces.executor.GenericElasticsearchExecutor): - - - -
- -
13class MarcoElasticsearchExecutor(GenericElasticsearchExecutor):
-14    query: GenericElasticsearchQuery
-15
-16    def __init__(
-17        self,
-18        topics: Dict[Union[str, int], Dict[str, str]],
-19        client: Elasticsearch,
-20        index_name: str,
-21        output_file: str,
-22        query: GenericElasticsearchQuery,
-23        encoder: Optional[Encoder] = None,
-24        config=None,
-25        *args,
-26        **kwargs,
-27    ):
-28        super().__init__(
-29            topics,
-30            client,
-31            index_name,
-32            output_file,
-33            query,
-34            encoder,
-35            config=config,
-36            *args,
-37            **kwargs,
-38        )
-39
-40        self.query_fns = {
-41            "query": self.generate_query,
-42            "embedding": self.generate_embedding_query,
-43        }
-44
-45    def generate_query(self, topic_num, best_fields=True, **kwargs):
-46        return self.query.generate_query(topic_num)
-47
-48    def generate_embedding_query(
-49        self,
-50        topic_num,
-51        cosine_weights=None,
-52        query_weights=None,
-53        norm_weight=2.15,
-54        automatic_scores=None,
-55        **kwargs,
-56    ):
-57        return super().generate_embedding_query(
-58            topic_num,
-59            cosine_weights=cosine_weights,
-60            query_weights=query_weights,
-61            norm_weight=2.15,
-62            automatic_scores=None,
-63            **kwargs,
-64        )
-65
-66    async def execute_query(
-67        self, query=None, topic_num=None, ablation=False, query_type="query", **kwargs
-68    ):
-69        return super().execute_query(
-70            query, topic_num, ablation, query_type=query_type, **kwargs
-71        )
-
- - -

Generic Executor class for Elasticsearch

-
- - -
- -
- - MarcoElasticsearchExecutor( topics: Dict[Union[str, int], Dict[str, str]], client: elasticsearch.AsyncElasticsearch, index_name: str, output_file: str, query: debeir.interfaces.query.GenericElasticsearchQuery, encoder: Optional[debeir.rankers.transformer_sent_encoder.Encoder] = None, config=None, *args, **kwargs) - - - -
- -
16    def __init__(
-17        self,
-18        topics: Dict[Union[str, int], Dict[str, str]],
-19        client: Elasticsearch,
-20        index_name: str,
-21        output_file: str,
-22        query: GenericElasticsearchQuery,
-23        encoder: Optional[Encoder] = None,
-24        config=None,
-25        *args,
-26        **kwargs,
-27    ):
-28        super().__init__(
-29            topics,
-30            client,
-31            index_name,
-32            output_file,
-33            query,
-34            encoder,
-35            config=config,
-36            *args,
-37            **kwargs,
-38        )
-39
-40        self.query_fns = {
-41            "query": self.generate_query,
-42            "embedding": self.generate_embedding_query,
-43        }
-
- - - - -
-
- -
- - def - generate_query(self, topic_num, best_fields=True, **kwargs): - - - -
- -
45    def generate_query(self, topic_num, best_fields=True, **kwargs):
-46        return self.query.generate_query(topic_num)
-
- - -

Generates a standard BM25 query given the topic number

- -
Parameters
- -
    -
  • topic_num: Query topic number to generate
  • -
  • best_fields: Whether to use a curated list of fields
  • -
  • kwargs:
  • -
- -
Returns
-
- - -
-
- -
- - def - generate_embedding_query( self, topic_num, cosine_weights=None, query_weights=None, norm_weight=2.15, automatic_scores=None, **kwargs): - - - -
- -
48    def generate_embedding_query(
-49        self,
-50        topic_num,
-51        cosine_weights=None,
-52        query_weights=None,
-53        norm_weight=2.15,
-54        automatic_scores=None,
-55        **kwargs,
-56    ):
-57        return super().generate_embedding_query(
-58            topic_num,
-59            cosine_weights=cosine_weights,
-60            query_weights=query_weights,
-61            norm_weight=norm_weight,
-62            automatic_scores=automatic_scores,
-63            **kwargs,
-64        )
-
- - -

Executes an NIR-style query with combined scoring.

- -
Parameters
- -
    -
  • topic_num:
  • -
  • cosine_weights:
  • -
  • query_weights:
  • -
  • norm_weight:
  • -
  • automatic_scores:
  • -
  • kwargs:
  • -
- -
Returns
-
- - -
-
- -
- - async def - execute_query( self, query=None, topic_num=None, ablation=False, query_type='query', **kwargs): - - - -
- -
66    async def execute_query(
-67        self, query=None, topic_num=None, ablation=False, query_type="query", **kwargs
-68    ):
-69        return await super().execute_query(
-70            query, topic_num, ablation, query_type=query_type, **kwargs
-71        )
-
- - -

Execute a query given parameters

- -
Parameters
- -
    -
  • args:
  • -
  • kwargs:
  • -
-
- - -
- -
-
- -
-
@dataclass(init=True, unsafe_hash=True)
- - class - MarcoQueryConfig(debeir.interfaces.config.GenericConfig): - - - -
- -
74@dataclass(init=True, unsafe_hash=True)
-75class MarcoQueryConfig(GenericConfig):
-76    def validate(self):
-77        if self.query_type == "embedding":
-78            assert (
-79                self.encoder_fp and self.encoder
-80            ), "Must provide encoder path for embedding model"
-81            assert self.norm_weight is not None or self.automatic is not None, (
-82                "Norm weight must be specified or be automatic"
-83            )
-84
-85    @classmethod
-86    def from_toml(cls, fp: str, *args, **kwargs) -> "MarcoQueryConfig":
-87        return super().from_toml(fp, cls, *args, **kwargs)
-88
-89    @classmethod
-90    def from_dict(cls, **kwargs) -> "MarcoQueryConfig":
-91        return super().from_dict(cls, **kwargs)
-
- - - - -
-
- - MarcoQueryConfig( query_type: str, index: str = None, encoder_normalize: bool = True, ablations: bool = False, norm_weight: float = None, automatic: bool = None, encoder: object = None, encoder_fp: str = None, query_weights: List[float] = None, cosine_weights: List[float] = None, evaluate: bool = False, qrels: str = None, config_fn: str = None, query_fn: str = None, parser_fn: str = None, executor_fn: str = None, cosine_ceiling: float = None, topics_path: str = None, return_id_only: bool = False, overwrite_output_if_exists: bool = False, output_file: str = None, run_name: str = None) - - -
- - - - -
-
- -
- - def - validate(self): - - - -
- -
76    def validate(self):
-77        if self.query_type == "embedding":
-78            assert (
-79                self.encoder_fp and self.encoder
-80            ), "Must provide encoder path for embedding model"
-81            assert self.norm_weight is not None or self.automatic is not None, (
-82                "Norm weight must be specified or be automatic"
-83            )
-
- - -

Validates whether the config is correct. Must be implemented by inherited classes.

-
- - -
-
- -
-
@classmethod
- - def - from_toml(cls, fp: str, *args, **kwargs) -> debeir.data_sets.marco.MarcoQueryConfig: - - - -
- -
85    @classmethod
-86    def from_toml(cls, fp: str, *args, **kwargs) -> "MarcoQueryConfig":
-87        return super().from_toml(fp, cls, *args, **kwargs)
-
- - -

Instantiates a Config object from a toml file

- -
Parameters
- -
    -
  • fp: File path of the Config TOML file
  • -
  • field_class: Class of the Config object to be instantiated
  • -
  • args: Arguments to be passed to Config
  • -
  • kwargs: Keyword arguments to be passed
  • -
- -
Returns
- -
-
An instantiated and validated Config object.
-
-
-
- - -
-
- -
-
@classmethod
- - def - from_dict(cls, **kwargs) -> debeir.data_sets.marco.MarcoQueryConfig: - - - -
- -
89    @classmethod
-90    def from_dict(cls, **kwargs) -> "MarcoQueryConfig":
-91        return super().from_dict(cls, **kwargs)
-
- - -

Instantiates a Config object from a dictionary

- -
Parameters
- -
    -
  • data_class:
  • -
  • kwargs:
  • -
- -
Returns
-
- - -
-
-
Inherited Members
-
- -
-
-
-
- - \ No newline at end of file diff --git a/docs/debeir/data_sets/trec_clinical_trials.html b/docs/debeir/data_sets/trec_clinical_trials.html deleted file mode 100644 index 94b5864..0000000 --- a/docs/debeir/data_sets/trec_clinical_trials.html +++ /dev/null @@ -1,611 +0,0 @@ - - - - - - - debeir.data_sets.trec_clinical_trials API documentation - - - - - - - - - -
-
-

-debeir.data_sets.trec_clinical_trials

- - - - - - -
  1import pathlib
-  2import xml.etree.ElementTree as ET
-  3from collections import defaultdict
-  4from typing import Dict, List
-  5
-  6import pandas as pd
-  7
-  8from debeir.interfaces.query import GenericElasticsearchQuery
-  9from debeir.interfaces.parser import XMLParser, JsonLinesParser
- 10
- 11
- 12class TREClinicalTrialDocumentParser(XMLParser):
- 13    """
- 14    Parser for Clinical Trials documents
- 15    """
- 16
- 17    parse_fields: List[str] = ["brief_title", "official_title",
- 18                               "brief_summary", "detailed_description",
- 19                               "eligibility", "condition_browse",
- 20                               "intervention_browse"]
- 21    topic_field_name: str
- 22    id_field: str
- 23
- 24    @classmethod
- 25    def extract(cls, path) -> Dict:
- 26        document = ET.parse(path).getroot()
- 27        document_dict = defaultdict(lambda: defaultdict(lambda: []))
- 28        document_dict['doc_id'] = pathlib.Path(path).stem
- 29
- 30        for parse_field in cls.parse_fields:
- 31            node = document.find(parse_field)
- 32            nodes: List[ET.Element] = []
- 33
- 34            if node is not None:
- 35                cls._recurse_to_child_node(node, nodes)
- 36
- 37            if len(nodes) == 0 and node is not None:
- 38                document_dict[parse_field] = node.text
- 39
- 40            for node in nodes:
- 41                text = node.text.strip()
- 42
- 43                if not text:
- 44                    continue
- 45
- 46                if document_dict[parse_field][node.tag]:
- 47                    document_dict[parse_field][node.tag].append(text)
- 48                else:
- 49                    document_dict[parse_field][node.tag] = [text]
- 50
- 51            cls.unwrap(document_dict, parse_field)
- 52
- 53        document_dict = pd.json_normalize(document_dict,
- 54                                          sep=".").to_dict(orient='records')[0]
- 55
- 56        return document_dict
- 57
- 58
- 59TrecClinicalTrialTripletParser = JsonLinesParser(
- 60    parse_fields=["q_text", "brief_title", "official_title",
- 61                  "brief_summary", "detailed_description", "rel"],
- 62    id_field="qid",
- 63    secondary_id="doc_id",
- 64    ignore_full_match=True
- 65)
- 66
- 67TrecClinicalTrialsParser = XMLParser(
- 68    parse_fields=None,
- 69    id_field="number",
- 70    topic_field_name="topic")
- 71
- 72
- 73class TrecClincialElasticsearchQuery(GenericElasticsearchQuery):
- 74    def __init__(self, topics, config, *args, **kwargs):
- 75        super().__init__(topics, config, *args, **kwargs)
- 76
- 77        #self.mappings = ['BriefTitle_Text',
- 78        #                 'BriefSummary_Text',
- 79        #                 'DetailedDescription_Text']
- 80
- 81        self.mappings = [
- 82            "BriefSummary_Text",
- 83            "BriefTitle_Text",
- 84            'DetailedDescription_Text',
- 85            'Eligibility.Criteria.Textblock',
- 86            'Eligibility.StudyPop.Textblock',
- 87            'ConditionBrowse.MeshTerm',
- 88            'InterventionBrowse.MeshTerm',
- 89            'Condition',
- 90            'Eligibility.Gender',
- 91            "OfficialTitle"]
- 92
- 93        self.topics = topics
- 94        self.config = config
- 95        self.query_type = self.config.query_type
- 96
- 97        self.embed_mappings = ['BriefTitle_Embedding',
- 98                               'BriefSummary_Embedding',
- 99                               'DetailedDescription_Embedding']
-100
-101        self.id_mapping = "docid"
-102
-103        self.query_funcs = {
-104            "query": self.generate_query,
-105            "embedding": self.generate_query_embedding,
-106        }
-
- - -
-
- -
- - class - TREClinicalTrialDocumentParser(debeir.interfaces.parser.XMLParser): - - - -
- -
13class TREClinicalTrialDocumentParser(XMLParser):
-14    """
-15    Parser for Clinical Trials documents
-16    """
-17
-18    parse_fields: List[str] = ["brief_title", "official_title",
-19                               "brief_summary", "detailed_description",
-20                               "eligibility", "condition_browse",
-21                               "intervention_browse"]
-22    topic_field_name: str
-23    id_field: str
-24
-25    @classmethod
-26    def extract(cls, path) -> Dict:
-27        document = ET.parse(path).getroot()
-28        document_dict = defaultdict(lambda: defaultdict(lambda: []))
-29        document_dict['doc_id'] = pathlib.Path(path).stem
-30
-31        for parse_field in cls.parse_fields:
-32            node = document.find(parse_field)
-33            nodes: List[ET.Element] = []
-34
-35            if node is not None:
-36                cls._recurse_to_child_node(node, nodes)
-37
-38            if len(nodes) == 0 and node is not None:
-39                document_dict[parse_field] = node.text
-40
-41            for node in nodes:
-42                text = node.text.strip()
-43
-44                if not text:
-45                    continue
-46
-47                if document_dict[parse_field][node.tag]:
-48                    document_dict[parse_field][node.tag].append(text)
-49                else:
-50                    document_dict[parse_field][node.tag] = [text]
-51
-52            cls.unwrap(document_dict, parse_field)
-53
-54        document_dict = pd.json_normalize(document_dict,
-55                                          sep=".").to_dict(orient='records')[0]
-56
-57        return document_dict
-
- - -

Parser for Clinical Trials documents

-
- - -
- -
-
@classmethod
- - def - extract(cls, path) -> Dict: - - - -
- -
25    @classmethod
-26    def extract(cls, path) -> Dict:
-27        document = ET.parse(path).getroot()
-28        document_dict = defaultdict(lambda: defaultdict(lambda: []))
-29        document_dict['doc_id'] = pathlib.Path(path).stem
-30
-31        for parse_field in cls.parse_fields:
-32            node = document.find(parse_field)
-33            nodes: List[ET.Element] = []
-34
-35            if node is not None:
-36                cls._recurse_to_child_node(node, nodes)
-37
-38            if len(nodes) == 0 and node is not None:
-39                document_dict[parse_field] = node.text
-40
-41            for node in nodes:
-42                text = node.text.strip()
-43
-44                if not text:
-45                    continue
-46
-47                if document_dict[parse_field][node.tag]:
-48                    document_dict[parse_field][node.tag].append(text)
-49                else:
-50                    document_dict[parse_field][node.tag] = [text]
-51
-52            cls.unwrap(document_dict, parse_field)
-53
-54        document_dict = pd.json_normalize(document_dict,
-55                                          sep=".").to_dict(orient='records')[0]
-56
-57        return document_dict
-
- - - - -
- -
-
- -
- - class - TrecClincialElasticsearchQuery(debeir.interfaces.query.GenericElasticsearchQuery): - - - -
- -
 74class TrecClincialElasticsearchQuery(GenericElasticsearchQuery):
- 75    def __init__(self, topics, config, *args, **kwargs):
- 76        super().__init__(topics, config, *args, **kwargs)
- 77
- 78        #self.mappings = ['BriefTitle_Text',
- 79        #                 'BriefSummary_Text',
- 80        #                 'DetailedDescription_Text']
- 81
- 82        self.mappings = [
- 83            "BriefSummary_Text",
- 84            "BriefTitle_Text",
- 85            'DetailedDescription_Text',
- 86            'Eligibility.Criteria.Textblock',
- 87            'Eligibility.StudyPop.Textblock',
- 88            'ConditionBrowse.MeshTerm',
- 89            'InterventionBrowse.MeshTerm',
- 90            'Condition',
- 91            'Eligibility.Gender',
- 92            "OfficialTitle"]
- 93
- 94        self.topics = topics
- 95        self.config = config
- 96        self.query_type = self.config.query_type
- 97
- 98        self.embed_mappings = ['BriefTitle_Embedding',
- 99                               'BriefSummary_Embedding',
-100                               'DetailedDescription_Embedding']
-101
-102        self.id_mapping = "docid"
-103
-104        self.query_funcs = {
-105            "query": self.generate_query,
-106            "embedding": self.generate_query_embedding,
-107        }
-
- - -

A generic Elasticsearch query. Contains methods for NIR-style (embedding) queries and normal BM25 queries. Requires topics and configs to be supplied.

-
- - -
- -
- - TrecClincialElasticsearchQuery(topics, config, *args, **kwargs) - - - -
- -
 75    def __init__(self, topics, config, *args, **kwargs):
- 76        super().__init__(topics, config, *args, **kwargs)
- 77
- 78        #self.mappings = ['BriefTitle_Text',
- 79        #                 'BriefSummary_Text',
- 80        #                 'DetailedDescription_Text']
- 81
- 82        self.mappings = [
- 83            "BriefSummary_Text",
- 84            "BriefTitle_Text",
- 85            'DetailedDescription_Text',
- 86        'Eligibility.Criteria.Textblock',
- 87            'Eligibility.StudyPop.Textblock',
- 88            'ConditionBrowse.MeshTerm',
- 89            'InterventionBrowse.MeshTerm',
- 90            'Condition',
- 91            'Eligibility.Gender',
- 92            "OfficialTitle"]
- 93
- 94        self.topics = topics
- 95        self.config = config
- 96        self.query_type = self.config.query_type
- 97
- 98        self.embed_mappings = ['BriefTitle_Embedding',
- 99                               'BriefSummary_Embedding',
-100                               'DetailedDescription_Embedding']
-101
-102        self.id_mapping = "docid"
-103
-104        self.query_funcs = {
-105            "query": self.generate_query,
-106            "embedding": self.generate_query_embedding,
-107        }
-
- - - - -
- -
-
- - \ No newline at end of file diff --git a/docs/debeir/data_sets/trec_covid.html b/docs/debeir/data_sets/trec_covid.html deleted file mode 100644 index d590bfe..0000000 --- a/docs/debeir/data_sets/trec_covid.html +++ /dev/null @@ -1,453 +0,0 @@ - - - - - - - debeir.data_sets.trec_covid API documentation - - - - - - - - - -
-
-

-debeir.data_sets.trec_covid

- - - - - - -
 1from typing import Dict
- 2
- 3from debeir.interfaces.query import GenericElasticsearchQuery
- 4from debeir.interfaces.parser import XMLParser
- 5
- 6
- 7class TrecCovidParser(XMLParser):
- 8    parse_fields = ["query", "question", "narrative"]
- 9    topic_field_name = "topic"
-10    id_field = "number"
-11
-12    @classmethod
-13    def get_topics(cls, xmlfile) -> Dict[int, Dict[str, str]]:
-14        return super().get_topics(xmlfile)
-15
-16
-17class TrecElasticsearchQuery(GenericElasticsearchQuery):
-18    def __init__(self, topics, config, *args, **kwargs):
-19        super().__init__(topics, config, *args, **kwargs)
-20
-21        self.mappings = ["title", "abstract", "fulltext"]
-22
-23        self.topics = topics
-24        self.config = config
-25        self.query_type = self.config.query_type
-26
-27        self.embed_mappings = [
-28            "title_embedding",
-29            "abstract_embedding",
-30            "fulltext_embedding",
-31        ]
-32
-33        self.id_mapping = "id"
-34
-35        self.query_funcs = {
-36            "query": self.generate_query,
-37            "embedding": self.generate_query_embedding,
-38        }
-
- - -
-
- -
- - class - TrecCovidParser(debeir.interfaces.parser.XMLParser): - - - -
- -
 8class TrecCovidParser(XMLParser):
- 9    parse_fields = ["query", "question", "narrative"]
-10    topic_field_name = "topic"
-11    id_field = "number"
-12
-13    @classmethod
-14    def get_topics(cls, xmlfile) -> Dict[int, Dict[str, str]]:
-15        return super().get_topics(xmlfile)
-
- - -

Load topics from an XML file

-
- - -
- -
-
@classmethod
- - def - get_topics(cls, xmlfile) -> Dict[int, Dict[str, str]]: - - - -
- -
13    @classmethod
-14    def get_topics(cls, xmlfile) -> Dict[int, Dict[str, str]]:
-15        return super().get_topics(xmlfile)
-
- - -

Gets topics from the given file; forwards parameters to the _get_topics class method.

-
- - -
- -
-
- -
- - class - TrecElasticsearchQuery(debeir.interfaces.query.GenericElasticsearchQuery): - - - -
- -
18class TrecElasticsearchQuery(GenericElasticsearchQuery):
-19    def __init__(self, topics, config, *args, **kwargs):
-20        super().__init__(topics, config, *args, **kwargs)
-21
-22        self.mappings = ["title", "abstract", "fulltext"]
-23
-24        self.topics = topics
-25        self.config = config
-26        self.query_type = self.config.query_type
-27
-28        self.embed_mappings = [
-29            "title_embedding",
-30            "abstract_embedding",
-31            "fulltext_embedding",
-32        ]
-33
-34        self.id_mapping = "id"
-35
-36        self.query_funcs = {
-37            "query": self.generate_query,
-38            "embedding": self.generate_query_embedding,
-39        }
-
- - -

A generic Elasticsearch query. Contains methods for NIR-style (embedding) queries and normal BM25 queries. Requires topics and configs to be supplied.

-
- - -
- -
- - TrecElasticsearchQuery(topics, config, *args, **kwargs) - - - -
- -
19    def __init__(self, topics, config, *args, **kwargs):
-20        super().__init__(topics, config, *args, **kwargs)
-21
-22        self.mappings = ["title", "abstract", "fulltext"]
-23
-24        self.topics = topics
-25        self.config = config
-26        self.query_type = self.config.query_type
-27
-28        self.embed_mappings = [
-29            "title_embedding",
-30            "abstract_embedding",
-31            "fulltext_embedding",
-32        ]
-33
-34        self.id_mapping = "id"
-35
-36        self.query_funcs = {
-37            "query": self.generate_query,
-38            "embedding": self.generate_query_embedding,
-39        }
-
- - - - -
- -
-
- - \ No newline at end of file diff --git a/docs/debeir/data_sets/types.html b/docs/debeir/data_sets/types.html deleted file mode 100644 index 2f5ec9f..0000000 --- a/docs/debeir/data_sets/types.html +++ /dev/null @@ -1,731 +0,0 @@ - - - - - - - debeir.data_sets.types API documentation - - - - - - - - - -
-
-

-debeir.data_sets.types

- - - - - - -
 1import string
- 2from collections import defaultdict
- 3from enum import Enum
- 4from typing import List, Union
- 5
- 6
- 7class InputExample:
- 8    """
- 9    Copied from Sentence Transformer Library
-10    Structure for one input example with texts, the label and a unique id
-11    """
-12
-13    def __init__(self, guid: str = '', texts: List[str] = None, label: Union[int, float] = 0):
-14        """
-15        Creates one InputExample with the given texts, guid and label
-16
-17        :param guid
-18            id for the example
-19        :param texts
-20            the texts for the example. Note, str.strip() is called on the texts
-21        :param label
-22            the label for the example
-23        """
-24        self.guid = guid
-25        self.texts = [text.strip() for text in texts]
-26        self.label = label
-27
-28    def __str__(self):
-29        return "<InputExample> label: {}, texts: {}".format(str(self.label), "; ".join(self.texts))
-30
-31    def get_label(self):
-32        return self.label
-33
-34    #def __getattr__(self, key):
-35    #    if key == "label":
-36    #        return self.get_label()
-37
-38    #    if key == "texts":
-39    #        return self.texts
-40
-41    #    if key in ["guid", "id"]:
-42    #        return self.guid
-43
-44    #    raise KeyError()
-45
-46    @classmethod
-47    def to_dict(cls, data: List['InputExample']):
-48        text_len = len(data[0].texts)
-49        processed_data = defaultdict(lambda: [])
-50
-51        for datum in data:
-52            # string.ascii_lowercase
-53
-54            processed_data["id"].append(datum.guid)
-55            processed_data["label"].append(datum.get_label())
-56
-57            for i in range(text_len):
-58                letter = string.ascii_lowercase[i]  # abcdefghi
-59                # processed_data[text_a] = ...
-60                processed_data[f"text_{letter}"].append(datum.texts[i])
-61
-62        return processed_data
-63
-64    @classmethod
-65    def from_parser_output(cls, data):
-66        pass
-67
-68
-69class RelevanceExample(InputExample):
-70    """
-71    Converts Relevance Labels to 0 - 1
-72    """
-73
-74    def __init__(self, max_score=2, *args, **kwargs):
-75        super().__init__(*args, **kwargs)
-76        self.max_score = max_score
-77
-78    def get_label(self):
-79        return self.relevance()
-80
-81    def relevance(self):
-82        """
-83        :return:
-84            Returns a normalised score for relevance between 0 - 1
-85        """
-86        return self.label / self.max_score
-87
-88
-89class DatasetTypes(Enum):
-90    """
-91    A collection of common dataset types that is usable in the library.
-92    """
-93    List = "List"
-94    ListInputExample = "ListInputExample"
-95    ListDict = "ListDict"
-96    HuggingfaceDataset = "HuggingfaceDataset"
-
- - -
-
- -
- - class - InputExample: - - - -
- -
 8class InputExample:
- 9    """
-10    Copied from Sentence Transformer Library
-11    Structure for one input example with texts, the label and a unique id
-12    """
-13
-14    def __init__(self, guid: str = '', texts: List[str] = None, label: Union[int, float] = 0):
-15        """
-16        Creates one InputExample with the given texts, guid and label
-17
-18        :param guid
-19            id for the example
-20        :param texts
-21            the texts for the example. Note, str.strip() is called on the texts
-22        :param label
-23            the label for the example
-24        """
-25        self.guid = guid
-26        self.texts = [text.strip() for text in texts]
-27        self.label = label
-28
-29    def __str__(self):
-30        return "<InputExample> label: {}, texts: {}".format(str(self.label), "; ".join(self.texts))
-31
-32    def get_label(self):
-33        return self.label
-34
-35    #def __getattr__(self, key):
-36    #    if key == "label":
-37    #        return self.get_label()
-38
-39    #    if key == "texts":
-40    #        return self.texts
-41
-42    #    if key in ["guid", "id"]:
-43    #        return self.guid
-44
-45    #    raise KeyError()
-46
-47    @classmethod
-48    def to_dict(cls, data: List['InputExample']):
-49        text_len = len(data[0].texts)
-50        processed_data = defaultdict(lambda: [])
-51
-52        for datum in data:
-53            # string.ascii_lowercase
-54
-55            processed_data["id"].append(datum.guid)
-56            processed_data["label"].append(datum.get_label())
-57
-58            for i in range(text_len):
-59                letter = string.ascii_lowercase[i]  # abcdefghi
-60                # processed_data[text_a] = ...
-61                processed_data[f"text_{letter}"].append(datum.texts[i])
-62
-63        return processed_data
-64
-65    @classmethod
-66    def from_parser_output(cls, data):
-67        pass
-
- - -

Copied from the Sentence Transformers library. Structure for one input example with texts, the label and a unique id.

-
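A minimal sketch of the column-oriented layout that to_dict produces; with two texts per example the generated columns are text_a and text_b:

    examples = [
        InputExample(guid="q1-d1", texts=["query text", "doc text"], label=1),
        InputExample(guid="q1-d2", texts=["query text", "other doc"], label=0),
    ]

    columns = InputExample.to_dict(examples)
    print(columns["id"])      # ['q1-d1', 'q1-d2']
    print(columns["text_a"])  # ['query text', 'query text']
    print(columns["label"])   # [1, 0]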
- - -
- -
- - InputExample( guid: str = '', texts: List[str] = None, label: Union[int, float] = 0) - - - -
- -
14    def __init__(self, guid: str = '', texts: List[str] = None, label: Union[int, float] = 0):
-15        """
-16        Creates one InputExample with the given texts, guid and label
-17
-18        :param guid
-19            id for the example
-20        :param texts
-21            the texts for the example. Note, str.strip() is called on the texts
-22        :param label
-23            the label for the example
-24        """
-25        self.guid = guid
-26        self.texts = [text.strip() for text in texts]
-27        self.label = label
-
- - -

Creates one InputExample with the given texts, guid and label

- -

:param guid: id for the example
:param texts: the texts for the example. Note, str.strip() is called on the texts
:param label: the label for the example

-
- - -
-
- -
- - def - get_label(self): - - - -
- -
32    def get_label(self):
-33        return self.label
-
- - - - -
-
- -
-
@classmethod
- - def - to_dict(cls, data: List[debeir.data_sets.types.InputExample]): - - - -
- -
47    @classmethod
-48    def to_dict(cls, data: List['InputExample']):
-49        text_len = len(data[0].texts)
-50        processed_data = defaultdict(lambda: [])
-51
-52        for datum in data:
-53            # string.ascii_lowercase
-54
-55            processed_data["id"].append(datum.guid)
-56            processed_data["label"].append(datum.get_label())
-57
-58            for i in range(text_len):
-59                letter = string.ascii_lowercase[i]  # abcdefghi
-60                # processed_data[text_a] = ...
-61                processed_data[f"text_{letter}"].append(datum.texts[i])
-62
-63        return processed_data
-
- - - - -
-
- -
-
@classmethod
- - def - from_parser_output(cls, data): - - - -
- -
65    @classmethod
-66    def from_parser_output(cls, data):
-67        pass
-
- - - - -
-
-
- -
- - class - RelevanceExample(InputExample): - - - -
- -
70class RelevanceExample(InputExample):
-71    """
-72    Converts Relevance Labels to 0 - 1
-73    """
-74
-75    def __init__(self, max_score=2, *args, **kwargs):
-76        super().__init__(*args, **kwargs)
-77        self.max_score = max_score
-78
-79    def get_label(self):
-80        return self.relevance()
-81
-82    def relevance(self):
-83        """
-84        :return:
-85            Returns a normalised score for relevance between 0 - 1
-86        """
-87        return self.label / self.max_score
-
- - -

Converts Relevance Labels to 0 - 1

-
- - -
- -
- - RelevanceExample(max_score=2, *args, **kwargs) - - - -
- -
75    def __init__(self, max_score=2, *args, **kwargs):
-76        super().__init__(*args, **kwargs)
-77        self.max_score = max_score
-
- - -

Creates one InputExample with the given texts, guid and label

- -

:param guid: id for the example
:param texts: the texts for the example. Note, str.strip() is called on the texts
:param label: the label for the example

-
- - -
-
- -
- - def - get_label(self): - - - -
- -
79    def get_label(self):
-80        return self.relevance()
-
- - - - -
-
- -
- - def - relevance(self): - - - -
- -
82    def relevance(self):
-83        """
-84        :return:
-85            Returns a normalised score for relevance between 0 - 1
-86        """
-87        return self.label / self.max_score
-
- - -
Returns
- -
-
Returns a normalised score for relevance between 0 - 1
-
-
-
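For instance, a graded relevance label of 1 with the default max_score of 2 normalises to 0.5:

    example = RelevanceExample(max_score=2, guid="q1-d1",
                               texts=["query", "document"], label=1)
    print(example.get_label())  # 0.5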
- - -
-
-
Inherited Members
-
- -
-
-
-
- -
- - class - DatasetTypes(enum.Enum): - - - -
- -
90class DatasetTypes(Enum):
-91    """
-92    A collection of common dataset types that is usable in the library.
-93    """
-94    List = "List"
-95    ListInputExample = "ListInputExample"
-96    ListDict = "ListDict"
-97    HuggingfaceDataset = "HuggingfaceDataset"
-
- - -

A collection of common dataset types that is usable in the library.

-
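Callers branch on these members the way CrossValidatorDataset.get_fold does; a small sketch:

    dataset_type = DatasetTypes.ListInputExample

    if dataset_type in (DatasetTypes.List, DatasetTypes.ListDict):
        pass  # plain Python lists: index into them directly
    elif dataset_type is DatasetTypes.ListInputExample:
        pass  # convert with InputExample.to_dict, then datasets.Dataset.from_dict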
- - -
-
Inherited Members
-
-
enum.Enum
-
name
-
value
- -
-
-
-
-
- - \ No newline at end of file diff --git a/docs/debeir/data_sets/utils.html b/docs/debeir/data_sets/utils.html deleted file mode 100644 index 16d0d81..0000000 --- a/docs/debeir/data_sets/utils.html +++ /dev/null @@ -1,545 +0,0 @@ - - - - - - - debeir.data_sets.utils API documentation - - - - - - - - - -
-
-

-debeir.data_sets.utils

- - - - - - -
 1# TODO: Convert a Parser Return Dict (Dict[int, Dict[str, ...])
- 2
- 3import datasets
- 4
- 5from debeir.data_sets.types import InputExample
- 6from debeir.evaluation.cross_validation import CrossValidator
- 7from debeir.data_sets.types import DatasetTypes
- 8from debeir.evaluation.evaluator import Evaluator
- 9
-10
-11class CrossValidatorDataset:
-12    """
-13    Cross Validator Dataset
-14    """
-15    cross_val_cls: CrossValidator
-16
-17    def __init__(self, dataset, cross_validator, n_folds, x_attr='text', y_attr='label'):
-18        self.cross_val_cls = cross_validator
-19        self.dataset = dataset
-20        self.fold = 0
-21        self.n_folds = n_folds
-22        self.x_attr = x_attr
-23        self.y_attr = y_attr
-24        self.folds = []
-25
-26    @classmethod
-27    def prepare_cross_validator(cls, data, evaluator: Evaluator,
-28                                n_splits: int, x_attr, y_attr, seed=42) -> 'CrossValidatorDataset':
-29        """
-30        Prepare the cross validator dataset object that will internally produce the folds.
-31
-32        :param data: Dataset to be used. Should be a list of dicts, or list of [x,y] or a Dataset object from data_sets
-33        :param evaluator: Evaluator to use for checking results
-34        :param n_splits: Number of cross validation splits, k-fold (stratified)
-35        :param seed: Seed to use (default 42)
-36        :param y_attr: Label, or idx of the y label
-37        :param x_attr: Label or idx of the x label (not directly used)
-38        """
-39
-40        return cls(data, CrossValidator(evaluator, data, x_attr, y_attr,
-41                                        n_splits=n_splits, seed=seed),
-42                   x_attr=x_attr, y_attr=y_attr,
-43                   n_folds=n_splits)
-44
-45    def get_fold(self, idx) -> datasets.DatasetDict:
-46        """
-47
-48        Gets the fold and returns a datasets.DatasetDict object with
-49        DatasetDict{'train': ..., 'val': ...}
-50
-51        :param idx:
-52        """
-53
-54        train_idxs, val_idxs = self.cross_val_cls.get_fold(idx)
-55        dataset_dict = datasets.DatasetDict()
-56
-57        if self.cross_val_cls.dataset_type in [DatasetTypes.List, DatasetTypes.ListDict]:
-58            # TODO: figure out how to make this into a huggingface dataset object generically
-59            train_subset = [self.dataset[i] for i in train_idxs]
-60            val_subset = [self.dataset[i] for i in val_idxs]
-61        elif self.cross_val_cls.dataset_type == DatasetTypes.ListInputExample:
-62            train_subset = InputExample.to_dict([self.dataset[i] for i in train_idxs])
-63            val_subset = InputExample.to_dict([self.dataset[i] for i in val_idxs])
-64
-65            dataset_dict['train'] = datasets.Dataset.from_dict(train_subset)
-66            dataset_dict['val'] = datasets.Dataset.from_dict(val_subset)
-67
-68        elif self.cross_val_cls.dataset_type == DatasetTypes.HuggingfaceDataset:
-69            train_subset = self.dataset.select(train_idxs)
-70            val_subset = self.dataset.select(val_idxs)
-71
-72            dataset_dict['train'] = datasets.Dataset.from_dict(train_subset)
-73            dataset_dict['val'] = datasets.Dataset.from_dict(val_subset)
-74
-75        return dataset_dict
-
- - -
-
- -
- - class - CrossValidatorDataset: - - - -
- -
12class CrossValidatorDataset:
-13    """
-14    Cross Validator Dataset
-15    """
-16    cross_val_cls: CrossValidator
-17
-18    def __init__(self, dataset, cross_validator, n_folds, x_attr='text', y_attr='label'):
-19        self.cross_val_cls = cross_validator
-20        self.dataset = dataset
-21        self.fold = 0
-22        self.n_folds = n_folds
-23        self.x_attr = x_attr
-24        self.y_attr = y_attr
-25        self.folds = []
-26
-27    @classmethod
-28    def prepare_cross_validator(cls, data, evaluator: Evaluator,
-29                                n_splits: int, x_attr, y_attr, seed=42) -> 'CrossValidatorDataset':
-30        """
-31        Prepare the cross validator dataset object that will internally produce the folds.
-32
-33        :param data: Dataset to be used. Should be a list of dicts, a list of [x, y] pairs, or a huggingface datasets.Dataset object
-34        :param evaluator: Evaluator to use for checking results
-35        :param n_splits: Number of cross validation splits, k-fold (stratified)
-36        :param seed: Seed to use (default 42)
-37        :param y_attr: Label, or idx of the y label
-38        :param x_attr: Label or idx of the x label (not directly used)
-39        """
-40
-41        return cls(data, CrossValidator(evaluator, data, x_attr, y_attr,
-42                                        n_splits=n_splits, seed=seed),
-43                   x_attr=x_attr, y_attr=y_attr,
-44                   n_folds=n_splits)
-45
-46    def get_fold(self, idx) -> datasets.DatasetDict:
-47        """
-48
-49        Get the fold at index idx and return a datasets.DatasetDict object with
-50        DatasetDict{'train': ..., 'val': ...}
-51
-52        :param idx: Index of the fold to retrieve
-53        """
-54
-55        train_idxs, val_idxs = self.cross_val_cls.get_fold(idx)
-56        dataset_dict = datasets.DatasetDict()
-57
-58        if self.cross_val_cls.dataset_type in [DatasetTypes.List, DatasetTypes.ListDict]:
-59            # TODO: figure out how to make this into a huggingface dataset object generically
-60            train_subset = [self.dataset[i] for i in train_idxs]
-61            val_subset = [self.dataset[i] for i in val_idxs]
-62        elif self.cross_val_cls.dataset_type == DatasetTypes.ListInputExample:
-63            train_subset = InputExample.to_dict([self.dataset[i] for i in train_idxs])
-64            val_subset = InputExample.to_dict([self.dataset[i] for i in val_idxs])
-65
-66            dataset_dict['train'] = datasets.Dataset.from_dict(train_subset)
-67            dataset_dict['val'] = datasets.Dataset.from_dict(val_subset)
-68
-69        elif self.cross_val_cls.dataset_type == DatasetTypes.HuggingfaceDataset:
-70            train_subset = self.dataset.select(train_idxs)
-71            val_subset = self.dataset.select(val_idxs)
-72
-73            dataset_dict['train'] = datasets.Dataset.from_dict(train_subset)
-74            dataset_dict['val'] = datasets.Dataset.from_dict(val_subset)
-75
-76        return dataset_dict
-
- - -

Cross Validator Dataset

-
- - -
- -
- - CrossValidatorDataset(dataset, cross_validator, n_folds, x_attr='text', y_attr='label') - - - -
- -
18    def __init__(self, dataset, cross_validator, n_folds, x_attr='text', y_attr='label'):
-19        self.cross_val_cls = cross_validator
-20        self.dataset = dataset
-21        self.fold = 0
-22        self.n_folds = n_folds
-23        self.x_attr = x_attr
-24        self.y_attr = y_attr
-25        self.folds = []
-
- - - - -
-
- -
-
@classmethod
- - def - prepare_cross_validator( cls, data, evaluator: debeir.evaluation.evaluator.Evaluator, n_splits: int, x_attr, y_attr, seed=42) -> debeir.data_sets.utils.CrossValidatorDataset: - - - -
- -
27    @classmethod
-28    def prepare_cross_validator(cls, data, evaluator: Evaluator,
-29                                n_splits: int, x_attr, y_attr, seed=42) -> 'CrossValidatorDataset':
-30        """
-31        Prepare the cross validator dataset object that will internally produce the folds.
-32
-33        :param data: Dataset to be used. Should be a list of dicts, a list of [x, y] pairs, or a huggingface datasets.Dataset object
-34        :param evaluator: Evaluator to use for checking results
-35        :param n_splits: Number of cross validation splits, k-fold (stratified)
-36        :param seed: Seed to use (default 42)
-37        :param y_attr: Label, or idx of the y label
-38        :param x_attr: Label or idx of the x label (not directly used)
-39        """
-40
-41        return cls(data, CrossValidator(evaluator, data, x_attr, y_attr,
-42                                        n_splits=n_splits, seed=seed),
-43                   x_attr=x_attr, y_attr=y_attr,
-44                   n_folds=n_splits)
-
- - -

Prepare the cross validator dataset object that will internally produce the folds.

- -
Parameters
- -
    -
  • data: Dataset to be used. Should be a list of dicts, a list of [x, y] pairs, or a huggingface datasets.Dataset object
  • -
  • evaluator: Evaluator to use for checking results
  • -
  • n_splits: Number of cross validation splits, k-fold (stratified)
  • -
  • seed: Seed to use (default 42)
  • -
  • y_attr: Label, or idx of the y label
  • -
  • x_attr: Label or idx of the x label (not directly used)
  • -
-
- - -
-
- -
- - def - get_fold(self, idx) -> datasets.dataset_dict.DatasetDict: - - - -
- -
46    def get_fold(self, idx) -> datasets.DatasetDict:
-47        """
-48
-49        Get the fold at index idx and return a datasets.DatasetDict object with
-50        DatasetDict{'train': ..., 'val': ...}
-51
-52        :param idx: Index of the fold to retrieve
-53        """
-54
-55        train_idxs, val_idxs = self.cross_val_cls.get_fold(idx)
-56        dataset_dict = datasets.DatasetDict()
-57
-58        if self.cross_val_cls.dataset_type in [DatasetTypes.List, DatasetTypes.ListDict]:
-59            # TODO: figure out how to make this into a huggingface dataset object generically
-60            train_subset = [self.dataset[i] for i in train_idxs]
-61            val_subset = [self.dataset[i] for i in val_idxs]
-62        elif self.cross_val_cls.dataset_type == DatasetTypes.ListInputExample:
-63            train_subset = InputExample.to_dict([self.dataset[i] for i in train_idxs])
-64            val_subset = InputExample.to_dict([self.dataset[i] for i in val_idxs])
-65
-66            dataset_dict['train'] = datasets.Dataset.from_dict(train_subset)
-67            dataset_dict['val'] = datasets.Dataset.from_dict(val_subset)
-68
-69        elif self.cross_val_cls.dataset_type == DatasetTypes.HuggingfaceDataset:
-70            train_subset = self.dataset.select(train_idxs)
-71            val_subset = self.dataset.select(val_idxs)
-72
-73            dataset_dict['train'] = datasets.Dataset.from_dict(train_subset)
-74            dataset_dict['val'] = datasets.Dataset.from_dict(val_subset)
-75
-76        return dataset_dict
-
- - -

Get the fold at the given index and return a datasets.DatasetDict object with DatasetDict{'train': ..., 'val': ...}

- -
Parameters
- -
    -
  • idx: Index of the fold to retrieve
  • -
-
- - -
-
-
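Putting prepare_cross_validator and get_fold together, a minimal usage sketch (hypothetical data; evaluator is assumed to be an already-constructed debeir.evaluation.evaluator.Evaluator, whose construction is not shown on this page):

from debeir.data_sets.utils import CrossValidatorDataset

data = [{"text": "a", "label": 0}, {"text": "b", "label": 1},
        {"text": "c", "label": 0}, {"text": "d", "label": 1}]

cv_dataset = CrossValidatorDataset.prepare_cross_validator(
    data, evaluator, n_splits=2, x_attr="text", y_attr="label", seed=42
)

for fold_idx in range(cv_dataset.n_folds):
    # DatasetDict{'train': ..., 'val': ...}; note that for plain list inputs
    # populating the DatasetDict is still a TODO in the source above
    fold = cv_dataset.get_fold(fold_idx)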
\ No newline at end of file
diff --git a/docs/debeir/datasets.html b/docs/debeir/datasets.html
index 7c0bfb4..d335441 100644
--- a/docs/debeir/datasets.html
+++ b/docs/debeir/datasets.html
@@ -53,7 +53,7 @@

Submodules

debeir.datasets

-

Contains data_sets implemented from nir.interfaces

+

Contains datasets implemented from debeir.core

  1. Parser (For reading data from files into a Dict object)
  2. Query object (Generating queries)
     - These query objects can be very lightweight containing only the mappings of the index.

@@ -69,7 +69,7 @@

    1"""
    -2Contains data_sets implemented from nir.interfaces
+2Contains datasets implemented from debeir.core
     31. Parser (For reading data from files into a Dict object)
     42. Query object (Generating queries)
     5    - These query objects can be very lightweight containing only the mappings of the index.
diff --git a/docs/debeir/evaluation/evaluator.html b/docs/debeir/evaluation/evaluator.html
    index 849f94c..6187de1 100644
    --- a/docs/debeir/evaluation/evaluator.html
    +++ b/docs/debeir/evaluation/evaluator.html
    @@ -319,7 +319,7 @@ 
    Returns
def
-average_all_metrics( self, runs: Dict, logger: <loguru.logger handlers=[(id=0, level=10, sink=<_io.StringIO object at 0x105cfa710>)]>):
+average_all_metrics( self, runs: Dict, logger: <loguru.logger handlers=[(id=0, level=10, sink=<_io.StringIO object at 0x103af2710>)]>):
diff --git a/docs/debeir/interfaces.html b/docs/debeir/interfaces.html
deleted file mode 100644
index b18bcb0..0000000
--- a/docs/debeir/interfaces.html
+++ /dev/null
@@ -1,254 +0,0 @@
-debeir.interfaces API documentation
    -
    -

    -debeir.interfaces

    - -

    Interfaces to implement custom data_sets in nir.data_sets.

    -
    - - - - - -
    1"""
    -2Interfaces to implement custom data_sets in nir.data_sets.
    -3"""
    -
    - - -
    -
\ No newline at end of file
diff --git a/docs/debeir/interfaces/callbacks.html b/docs/debeir/interfaces/callbacks.html
deleted file mode 100644
index 75e339f..0000000
--- a/docs/debeir/interfaces/callbacks.html
+++ /dev/null
@@ -1,859 +0,0 @@
-debeir.interfaces.callbacks API documentation
    -
    -

    -debeir.interfaces.callbacks

    - -

Callbacks that run before and after the pipeline. E.g. before is for setup; after is for evaluation/serialization etc.

    -
    - - - - - -
      1"""
-  2Callbacks that run before and after the pipeline.
-  3E.g. before is for setup,
-  4after is for evaluation/serialization etc.
    -  5"""
    -  6
    -  7import abc
    -  8import os
    -  9import tempfile
    - 10import uuid
    - 11import loguru
    - 12
    - 13from typing import List
    - 14from debeir.interfaces.pipeline import Pipeline
    - 15from debeir.data_sets.factory import query_factory
    - 16from debeir.evaluation.evaluator import Evaluator
    - 17from debeir.interfaces.config import GenericConfig, NIRConfig
    - 18
    - 19
    - 20class Callback:
    - 21    def __init__(self):
    - 22        self.pipeline = None
    - 23
    - 24    @abc.abstractmethod
    - 25    def before(self, pipeline: Pipeline):
    - 26        pass
    - 27
    - 28    @abc.abstractmethod
    - 29    def after(self, results: List):
    - 30        pass
    - 31
    - 32
    - 33class SerializationCallback(Callback):
    - 34    def __init__(self, config: GenericConfig, nir_config: NIRConfig):
    - 35        super().__init__()
    - 36        self.config = config
    - 37        self.nir_config = nir_config
    - 38        self.output_file = None
    - 39        self.query_cls = query_factory[self.config.query_fn]
    - 40
    - 41    def before(self, pipeline: Pipeline):
    - 42        """
    - 43        Check if output file exists
    - 44
    - 45        :return:
    - 46            Output file path
    - 47        """
    - 48
- 49        self.pipeline = pipeline
    - 50
    - 51        output_file = self.config.output_file
    - 52        output_dir = os.path.join(self.nir_config.output_directory, self.config.index)
    - 53
    - 54        if output_file is None:
    - 55            os.makedirs(name=output_dir, exist_ok=True)
    - 56            output_file = os.path.join(output_dir, str(uuid.uuid4()))
    - 57
    - 58            loguru.logger.info(f"Output file not specified, writing to: {output_file}")
    - 59
    - 60        else:
    - 61            output_file = os.path.join(output_dir, output_file)
    - 62
    - 63        if os.path.exists(output_file):
    - 64            if not self.config.overwrite_output_if_exists:
    - 65                raise RuntimeError("Directory exists and isn't explicitly overwritten "
    - 66                                   "in config with overwrite_output_if_exists=True")
    - 67
    - 68            loguru.logger.info(f"Output file exists: {output_file}. Overwriting...")
    - 69            open(output_file, "w+").close()
    - 70
    - 71        pipeline.output_file = output_file
    - 72        self.output_file = output_file
    - 73
    - 74    def after(self, results: List):
    - 75        """
    - 76        Serialize results to self.output_file in a TREC-style format
- 77        :param results: List of (topic_num, raw Elasticsearch result) tuples
- 78            to serialize. The run name is taken from the config
- 79            (default: NO_RUN_NAME).
    - 80        """
    - 81
    - 82        self._after(results,
    - 83                    output_file=self.output_file,
    - 84                    query_cls=self.query_cls,
    - 85                    run_name=self.config.run_name)
    - 86
    - 87    @classmethod
- 88    def _after(cls, results: List, output_file, query_cls, run_name=None):
    - 89        if run_name is None:
    - 90            run_name = "NO_RUN_NAME"
    - 91
    - 92        with open(output_file, "a+t") as writer:
    - 93            for (topic_num, res) in results:
    - 94                for rank, result in enumerate(res["hits"]["hits"], start=1):
    - 95                    doc_id = None
    - 96
    - 97                    # if self.return_id_only:
    - 98                    #    doc_id = self.query.get_id_mapping(result["fields"])[0]
    - 99                    # else:
    -100                    doc_id = query_cls.get_id_mapping(result["_source"])
    -101
    -102                    line = f"{topic_num}\t" \
    -103                           f"Q0\t" \
    -104                           f"{doc_id}\t" \
    -105                           f"{rank}\t" \
    -106                           f"{result['_score']}\t" \
    -107                           f"{run_name}\n"
    -108
    -109                    writer.write(line)
    -110
    -111
    -112class EvaluationCallback(Callback):
    -113    def __init__(self, evaluator: Evaluator, config):
    -114        super().__init__()
    -115        self.evaluator = evaluator
    -116        self.config = config
    -117        self.parsed_run = None
    -118
    -119    def before(self, pipeline: Pipeline):
-120        self.pipeline = pipeline
    -121
    -122    def after(self, results: List, id_field="id"):
    -123        if self.pipeline.output_file is None:
    -124            directory_name = tempfile.mkdtemp()
    -125            fn = str(uuid.uuid4())
    -126
    -127            fp = os.path.join(directory_name, fn)
    -128
    -129            query = query_factory[self.config.query_fn]
    -130            query.id_field = id_field
    -131
    -132            SerializationCallback._after(results,
    -133                                         output_file=fp,
    -134                                         query_cls=query,
    -135                                         run_name=self.config.run_name)
    -136
    -137            self.pipeline.output_file = fp
    -138
    -139        parsed_run = self.evaluator.evaluate_runs(self.pipeline.output_file, disable_cache=True)
    -140        self.parsed_run = parsed_run
    -141
    -142        return self.parsed_run
    -
    - - -
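For concreteness, a minimal sketch of a user-defined callback against the abstract Callback interface in the listing above (hypothetical; it only logs):

import loguru
from typing import List

from debeir.interfaces.callbacks import Callback
from debeir.interfaces.pipeline import Pipeline


class LoggingCallback(Callback):
    def before(self, pipeline: Pipeline):
        # Keep a handle on the running pipeline for use in after()
        self.pipeline = pipeline
        loguru.logger.info("Pipeline starting")

    def after(self, results: List):
        loguru.logger.info(f"Pipeline returned {len(results)} topic results")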
    -
    - -
    - - class - Callback: - - - -
    - -
    21class Callback:
    -22    def __init__(self):
    -23        self.pipeline = None
    -24
    -25    @abc.abstractmethod
    -26    def before(self, pipeline: Pipeline):
    -27        pass
    -28
    -29    @abc.abstractmethod
    -30    def after(self, results: List):
    -31        pass
    -
    - - - - -
    - -
    - - Callback() - - - -
    - -
    22    def __init__(self):
    -23        self.pipeline = None
    -
    - - - - -
    -
    - -
    -
    @abc.abstractmethod
    - - def - before(self, pipeline: debeir.interfaces.pipeline.Pipeline): - - - -
    - -
    25    @abc.abstractmethod
    -26    def before(self, pipeline: Pipeline):
    -27        pass
    -
    - - - - -
    -
    - -
    -
    @abc.abstractmethod
    - - def - after(self, results: List): - - - -
    - -
    29    @abc.abstractmethod
    -30    def after(self, results: List):
    -31        pass
    -
    - - - - -
    -
    -
    - -
    - - class - SerializationCallback(Callback): - - - -
    - -
     34class SerializationCallback(Callback):
    - 35    def __init__(self, config: GenericConfig, nir_config: NIRConfig):
    - 36        super().__init__()
    - 37        self.config = config
    - 38        self.nir_config = nir_config
    - 39        self.output_file = None
    - 40        self.query_cls = query_factory[self.config.query_fn]
    - 41
    - 42    def before(self, pipeline: Pipeline):
    - 43        """
    - 44        Check if output file exists
    - 45
    - 46        :return:
    - 47            Output file path
    - 48        """
    - 49
- 50        self.pipeline = pipeline
    - 51
    - 52        output_file = self.config.output_file
    - 53        output_dir = os.path.join(self.nir_config.output_directory, self.config.index)
    - 54
    - 55        if output_file is None:
    - 56            os.makedirs(name=output_dir, exist_ok=True)
    - 57            output_file = os.path.join(output_dir, str(uuid.uuid4()))
    - 58
    - 59            loguru.logger.info(f"Output file not specified, writing to: {output_file}")
    - 60
    - 61        else:
    - 62            output_file = os.path.join(output_dir, output_file)
    - 63
    - 64        if os.path.exists(output_file):
    - 65            if not self.config.overwrite_output_if_exists:
    - 66                raise RuntimeError("Directory exists and isn't explicitly overwritten "
    - 67                                   "in config with overwrite_output_if_exists=True")
    - 68
    - 69            loguru.logger.info(f"Output file exists: {output_file}. Overwriting...")
    - 70            open(output_file, "w+").close()
    - 71
    - 72        pipeline.output_file = output_file
    - 73        self.output_file = output_file
    - 74
    - 75    def after(self, results: List):
    - 76        """
    - 77        Serialize results to self.output_file in a TREC-style format
- 78        :param results: List of (topic_num, raw Elasticsearch result) tuples
- 79            to serialize. The run name is taken from the config
- 80            (default: NO_RUN_NAME).
    - 81        """
    - 82
    - 83        self._after(results,
    - 84                    output_file=self.output_file,
    - 85                    query_cls=self.query_cls,
    - 86                    run_name=self.config.run_name)
    - 87
    - 88    @classmethod
- 89    def _after(cls, results: List, output_file, query_cls, run_name=None):
    - 90        if run_name is None:
    - 91            run_name = "NO_RUN_NAME"
    - 92
    - 93        with open(output_file, "a+t") as writer:
    - 94            for (topic_num, res) in results:
    - 95                for rank, result in enumerate(res["hits"]["hits"], start=1):
    - 96                    doc_id = None
    - 97
    - 98                    # if self.return_id_only:
    - 99                    #    doc_id = self.query.get_id_mapping(result["fields"])[0]
    -100                    # else:
    -101                    doc_id = query_cls.get_id_mapping(result["_source"])
    -102
    -103                    line = f"{topic_num}\t" \
    -104                           f"Q0\t" \
    -105                           f"{doc_id}\t" \
    -106                           f"{rank}\t" \
    -107                           f"{result['_score']}\t" \
    -108                           f"{run_name}\n"
    -109
    -110                    writer.write(line)
    -
    - - - - -
    - -
    - - SerializationCallback( config: debeir.interfaces.config.GenericConfig, nir_config: debeir.interfaces.config.NIRConfig) - - - -
    - -
    35    def __init__(self, config: GenericConfig, nir_config: NIRConfig):
    -36        super().__init__()
    -37        self.config = config
    -38        self.nir_config = nir_config
    -39        self.output_file = None
    -40        self.query_cls = query_factory[self.config.query_fn]
    -
    - - - - -
    -
    - -
    - - def - before(self, pipeline: debeir.interfaces.pipeline.Pipeline): - - - -
    - -
    42    def before(self, pipeline: Pipeline):
    -43        """
    -44        Check if output file exists
    -45
    -46        :return:
    -47            Output file path
    -48        """
    -49
-50        self.pipeline = pipeline
    -51
    -52        output_file = self.config.output_file
    -53        output_dir = os.path.join(self.nir_config.output_directory, self.config.index)
    -54
    -55        if output_file is None:
    -56            os.makedirs(name=output_dir, exist_ok=True)
    -57            output_file = os.path.join(output_dir, str(uuid.uuid4()))
    -58
    -59            loguru.logger.info(f"Output file not specified, writing to: {output_file}")
    -60
    -61        else:
    -62            output_file = os.path.join(output_dir, output_file)
    -63
    -64        if os.path.exists(output_file):
    -65            if not self.config.overwrite_output_if_exists:
    -66                raise RuntimeError("Directory exists and isn't explicitly overwritten "
    -67                                   "in config with overwrite_output_if_exists=True")
    -68
    -69            loguru.logger.info(f"Output file exists: {output_file}. Overwriting...")
    -70            open(output_file, "w+").close()
    -71
    -72        pipeline.output_file = output_file
    -73        self.output_file = output_file
    -
    - - -

Check if the output file exists, then resolve the output file path and set it on the pipeline
    -
    -
    -
    - - -
    -
    - -
    - - def - after(self, results: List): - - - -
    - -
    75    def after(self, results: List):
    -76        """
    -77        Serialize results to self.output_file in a TREC-style format
-78        :param results: List of (topic_num, raw Elasticsearch result) tuples
-79            to serialize. The run name is taken from the config
-80            (default: NO_RUN_NAME).
    -81        """
    -82
    -83        self._after(results,
    -84                    output_file=self.output_file,
    -85                    query_cls=self.query_cls,
    -86                    run_name=self.config.run_name)
    -
    - - -

    Serialize results to self.output_file in a TREC-style format

    - -
    Parameters
    - -
      -
  • results: List of (topic_num, raw Elasticsearch result) tuples to serialize; the run name is taken from the config (default: NO_RUN_NAME)
  • -
    -
    - - -
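Each line written by _after follows the tab-separated TREC run format: topic number, the literal Q0, document id, rank, score, and run name. For example (document ids and scores are illustrative):

101	Q0	doc_1234	1	24.53	my_run
101	Q0	doc_5678	2	23.87	my_run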
    -
    -
    - -
    - - class - EvaluationCallback(Callback): - - - -
    - -
    113class EvaluationCallback(Callback):
    -114    def __init__(self, evaluator: Evaluator, config):
    -115        super().__init__()
    -116        self.evaluator = evaluator
    -117        self.config = config
    -118        self.parsed_run = None
    -119
    -120    def before(self, pipeline: Pipeline):
-121        self.pipeline = pipeline
    -122
    -123    def after(self, results: List, id_field="id"):
    -124        if self.pipeline.output_file is None:
    -125            directory_name = tempfile.mkdtemp()
    -126            fn = str(uuid.uuid4())
    -127
    -128            fp = os.path.join(directory_name, fn)
    -129
    -130            query = query_factory[self.config.query_fn]
    -131            query.id_field = id_field
    -132
    -133            SerializationCallback._after(results,
    -134                                         output_file=fp,
    -135                                         query_cls=query,
    -136                                         run_name=self.config.run_name)
    -137
    -138            self.pipeline.output_file = fp
    -139
    -140        parsed_run = self.evaluator.evaluate_runs(self.pipeline.output_file, disable_cache=True)
    -141        self.parsed_run = parsed_run
    -142
    -143        return self.parsed_run
    -
    - - - - -
    - -
    - - EvaluationCallback(evaluator: debeir.evaluation.evaluator.Evaluator, config) - - - -
    - -
    114    def __init__(self, evaluator: Evaluator, config):
    -115        super().__init__()
    -116        self.evaluator = evaluator
    -117        self.config = config
    -118        self.parsed_run = None
    -
    - - - - -
    -
    - -
    - - def - before(self, pipeline: debeir.interfaces.pipeline.Pipeline): - - - -
    - -
    120    def before(self, pipeline: Pipeline):
-121        self.pipeline = pipeline
    -
    - - - - -
    -
    - -
    - - def - after(self, results: List, id_field='id'): - - - -
    - -
    123    def after(self, results: List, id_field="id"):
    -124        if self.pipeline.output_file is None:
    -125            directory_name = tempfile.mkdtemp()
    -126            fn = str(uuid.uuid4())
    -127
    -128            fp = os.path.join(directory_name, fn)
    -129
    -130            query = query_factory[self.config.query_fn]
    -131            query.id_field = id_field
    -132
    -133            SerializationCallback._after(results,
    -134                                         output_file=fp,
    -135                                         query_cls=query,
    -136                                         run_name=self.config.run_name)
    -137
    -138            self.pipeline.output_file = fp
    -139
    -140        parsed_run = self.evaluator.evaluate_runs(self.pipeline.output_file, disable_cache=True)
    -141        self.parsed_run = parsed_run
    -142
    -143        return self.parsed_run
    -
    - - - - -
    -
    -
\ No newline at end of file
diff --git a/docs/debeir/interfaces/config.html b/docs/debeir/interfaces/config.html
deleted file mode 100644
index f79f8c7..0000000
--- a/docs/debeir/interfaces/config.html
+++ /dev/null
@@ -1,1480 +0,0 @@
-debeir.interfaces.config API documentation
    -
    -

    -debeir.interfaces.config

    - - - - - - -
      1import abc
    -  2import dataclasses
    -  3import os
    -  4from abc import ABC
    -  5from dataclasses import dataclass
    -  6from pathlib import Path
    -  7from typing import List, MutableMapping, Dict, Union
    -  8
    -  9import loguru
    - 10import toml
    - 11
    - 12
    - 13class Config:
    - 14    """
    - 15    Config Interface with creation class methods
    - 16    """
    - 17
    - 18    def __update__(self, **kwargs):
    - 19        attrs = vars(self)
    - 20        kwargs.update(attrs)
    - 21
    - 22        return kwargs
    - 23
    - 24    @classmethod
    - 25    def from_toml(cls, fp: Union[str, Path], field_class, *args, **kwargs) -> 'Config':
    - 26        """
    - 27        Instantiates a Config object from a toml file
    - 28
    - 29        :param fp: File path of the Config TOML file
    - 30        :param field_class: Class of the Config object to be instantiated
    - 31        :param args: Arguments to be passed to Config
    - 32        :param kwargs: Keyword arguments to be passed
    - 33        :return:
- 34            An instantiated and validated Config object.
    - 35        """
    - 36        args_dict = toml.load(fp)
    - 37
    - 38        return cls.from_args(args_dict, field_class, *args, **kwargs)
    - 39
    - 40    @classmethod
    - 41    def from_args(cls, args_dict: MutableMapping, field_class, *args, **kwargs):
    - 42        """
    - 43        Instantiates a Config object from arguments
    - 44
    - 45
    - 46        :param args_dict:
    - 47        :param field_class:
    - 48        :param args:
    - 49        :param kwargs:
    - 50        :return:
    - 51        """
    - 52        from debeir.rankers.transformer_sent_encoder import Encoder
    - 53
    - 54        field_names = set(f.name for f in dataclasses.fields(field_class))
    - 55        obj = field_class(**{k: v for k, v in args_dict.items() if k in field_names})
    - 56        if hasattr(obj, 'encoder_fp') and obj.encoder_fp:
    - 57            obj.encoder = Encoder(obj.encoder_fp, obj.encoder_normalize)
    - 58
    - 59        obj.validate()
    - 60
    - 61        return obj
    - 62
    - 63    @classmethod
    - 64    def from_dict(cls, data_class, **kwargs):
    - 65        """
    - 66        Instantiates a Config object from a dictionary
    - 67
    - 68        :param data_class:
    - 69        :param kwargs:
    - 70        :return:
    - 71        """
    - 72        from debeir.rankers.transformer_sent_encoder import Encoder
    - 73
    - 74        if "encoder_fp" in kwargs and kwargs["encoder_fp"]:
    - 75            kwargs["encoder"] = Encoder(kwargs["encoder_fp"])
    - 76
    - 77        field_names = set(f.name for f in dataclasses.fields(data_class))
    - 78        obj = data_class(**{k: v for k, v in kwargs.items() if k in field_names})
- 79        obj.validate()
    - 80
    - 81        return obj
    - 82
    - 83    @abc.abstractmethod
    - 84    def validate(self):
    - 85        """
    - 86        Validates if the config is correct.
    - 87        Must be implemented by inherited classes.
    - 88        """
    - 89        pass
    - 90
    - 91
    - 92@dataclass(init=True, unsafe_hash=True)
    - 93class GenericConfig(Config, ABC):
    - 94    """
    - 95    Generic NIR Configuration file for which all configs will inherit
    - 96    """
    - 97    query_type: str
    - 98    index: str = None
    - 99    encoder_normalize: bool = True
    -100    ablations: bool = False
    -101    norm_weight: float = None
    -102    automatic: bool = None
    -103    encoder: object = None
    -104    encoder_fp: str = None
    -105    query_weights: List[float] = None
    -106    cosine_weights: List[float] = None
    -107    evaluate: bool = False
    -108    qrels: str = None
    -109    config_fn: str = None
    -110    query_fn: str = None
    -111    parser_fn: str = None
    -112    executor_fn: str = None
    -113    cosine_ceiling: float = None
    -114    topics_path: str = None
    -115    return_id_only: bool = False
    -116    overwrite_output_if_exists: bool = False
    -117    output_file: str = None
    -118    run_name: str = None
    -119
    -120
    -121    @classmethod
    -122    def from_toml(cls, fp: Union[str, Path], *args, **kwargs) -> 'GenericConfig':
    -123        return Config.from_toml(fp, cls, *args, **kwargs)
    -124
    -125
    -126@dataclass(init=True)
    -127class _NIRMasterConfig(Config):
    -128    """
    -129    Base NIR Master config: nir.toml
    -130    """
    -131    metrics: Dict
    -132    search: Dict
    -133    nir: Dict
    -134
    -135    def get_metrics(self, key='common', return_as_instance=False):
    -136        metrics = self.metrics[key]
    -137        if return_as_instance:
    -138            return MetricsConfig.from_args(metrics, MetricsConfig)
    -139
    -140        return metrics
    -141
    -142    def get_search_engine_settings(self, key='elasticsearch', return_as_instance=False):
    -143        engine_settings = self.search['engines'][key]
    -144        if return_as_instance:
    -145            return ElasticsearchConfig.from_args(engine_settings, ElasticsearchConfig)
    -146
    -147        return engine_settings
    -148
    -149
    -150    def get_nir_settings(self, key='default_settings', return_as_instance=False):
    -151        nir_settings = self.nir[key]
    -152
    -153        if return_as_instance:
    -154            return NIRConfig.from_args(nir_settings, NIRConfig)
    -155
    -156        return nir_settings
    -157
    -158    def validate(self):
    -159        return True
    -160
    -161
    -162@dataclass(init=True)
    -163class ElasticsearchConfig(Config):
    -164    """
    -165    Basic Elasticsearch configuration file settings from the master nir.toml file
    -166    """
    -167    protocol: str
    -168    ip: str
    -169    port: str
    -170    timeout: int
    -171
    -172    def validate(self):
    -173        """
    -174        Checks if Elasticsearch URL is correct
    -175        """
    -176        assert self.protocol in ['http', 'https']
    -177        assert self.port.isdigit()
    -178
    -179
    -180@dataclass(init=True)
    -181class SolrConfig(ElasticsearchConfig):
    -182    """
    -183    Basic Solr configuration file settings from the master nir.toml file
    -184    """
    -185    pass
    -186
    -187
    -188@dataclass(init=True)
    -189class MetricsConfig(Config):
    -190    """
    -191    Basic Metrics configuration file settings from the master nir.toml file
    -192    """
    -193    metrics: List[str]
    -194
    -195    def validate(self):
    -196        """
-197        Checks that each metric is usable by the evaluator classes (e.g. "ndcg@10")
    -198        """
    -199        for metric in self.metrics:
    -200            assert "@" in metric
    -201
    -202            metric, depth = metric.split("@")
    -203
    -204            assert metric.isalpha()
    -205            assert depth.isdigit()
    -206
    -207
    -208@dataclass(init=True)
    -209class NIRConfig(Config):
    -210    """
    -211    Basic NIR configuration file settings from the master nir.toml file
    -212    """
    -213    norm_weight: str
    -214    evaluate: bool
    -215    return_size: int
    -216    output_directory: str
    -217
    -218    def validate(self):
    -219        return True
    -220
    -221
    -222def apply_config(func):
    -223    """
    -224    Configuration decorator.
    -225
    -226    :param func: Decorated function
    -227    :return:
    -228    """
    -229
    -230    def use_config(self, *args, **kwargs):
    -231        """
    -232        Replaces keywords and args passed to the function with ones from self.config.
    -233
    -234        :param self:
    -235        :param args: To be updated
    -236        :param kwargs: To be updated
    -237        :return:
    -238        """
    -239        if self.config is not None:
    -240            kwargs = self.config.__update__(**kwargs)
    -241
    -242        return func(self, *args, **kwargs)
    -243
    -244    return use_config
    -245
    -246
    -247def override_with_toml_config(func):
    -248    """
-249    Configuration decorator. Overwrite a function's kwargs and args with a specified toml config file.
-250    Pass override_with_config_=path/to/config
    -251
    -252    :param func: Decorated function
    -253    :return:
    -254    """
    -255
    -256    def override_with(override_with_config_: str = None, *args, **kwargs):
    -257        """
    -258        Replaces keywords and args passed to the function with ones from self.config.
    -259
    -260        :param override_with_config_: Path to config else None
    -261        :param args: To be updated
    -262        :param kwargs: To be updated
    -263        :return:
    -264        """
    -265
    -266        if f"override_{func.__name__}_with_config_" in kwargs:
-267            override_with_config_ = kwargs[f"override_{func.__name__}_with_config_"]
    -268
    -269        if override_with_config_ is not None:
    -270            if os.path.exists(override_with_config_):
    -271                toml_kwargs = toml.load(override_with_config_)
-272                kwargs.update(toml_kwargs)
    -273
    -274        return func(*args, **kwargs)
    -275
    -276    return override_with
    -277
    -278
    -279def save_kwargs_to_file(func):
    -280    def save_kwargs(save_kwargs_to_: str = None, *args, **kwargs):
    -281        """
-282        Save kwargs passed to the function to output_file = f"{save_kwargs_to_}/{func.__name__}.toml"
    -283
    -284        :param save_kwargs_to_: Path to save location for config else None. This should be a DIRECTORY.
    -285        :param args: To be updated
    -286        :param kwargs: To be updated
    -287        :return:
    -288        """
    -289        if save_kwargs_to_ is not None:
    -290            os.makedirs(save_kwargs_to_, exist_ok=True)
    -291
    -292            if os.path.exists(save_kwargs_to_):
    -293                output_file = f"{save_kwargs_to_}/{func.__name__}.toml"
    -294                loguru.logger.info(f"Saving kwargs to {output_file}")
    -295                toml.dump(kwargs, open(output_file, "w+"))
    -296
    -297        return func(*args, **kwargs)
    -298
    -299    return save_kwargs
    -
    - - -
    -
    - -
    - - class - Config: - - - -
    - -
    14class Config:
    -15    """
    -16    Config Interface with creation class methods
    -17    """
    -18
    -19    def __update__(self, **kwargs):
    -20        attrs = vars(self)
    -21        kwargs.update(attrs)
    -22
    -23        return kwargs
    -24
    -25    @classmethod
    -26    def from_toml(cls, fp: Union[str, Path], field_class, *args, **kwargs) -> 'Config':
    -27        """
    -28        Instantiates a Config object from a toml file
    -29
    -30        :param fp: File path of the Config TOML file
    -31        :param field_class: Class of the Config object to be instantiated
    -32        :param args: Arguments to be passed to Config
    -33        :param kwargs: Keyword arguments to be passed
    -34        :return:
-35            An instantiated and validated Config object.
    -36        """
    -37        args_dict = toml.load(fp)
    -38
    -39        return cls.from_args(args_dict, field_class, *args, **kwargs)
    -40
    -41    @classmethod
    -42    def from_args(cls, args_dict: MutableMapping, field_class, *args, **kwargs):
    -43        """
    -44        Instantiates a Config object from arguments
    -45
    -46
    -47        :param args_dict:
    -48        :param field_class:
    -49        :param args:
    -50        :param kwargs:
    -51        :return:
    -52        """
    -53        from debeir.rankers.transformer_sent_encoder import Encoder
    -54
    -55        field_names = set(f.name for f in dataclasses.fields(field_class))
    -56        obj = field_class(**{k: v for k, v in args_dict.items() if k in field_names})
    -57        if hasattr(obj, 'encoder_fp') and obj.encoder_fp:
    -58            obj.encoder = Encoder(obj.encoder_fp, obj.encoder_normalize)
    -59
    -60        obj.validate()
    -61
    -62        return obj
    -63
    -64    @classmethod
    -65    def from_dict(cls, data_class, **kwargs):
    -66        """
    -67        Instantiates a Config object from a dictionary
    -68
    -69        :param data_class:
    -70        :param kwargs:
    -71        :return:
    -72        """
    -73        from debeir.rankers.transformer_sent_encoder import Encoder
    -74
    -75        if "encoder_fp" in kwargs and kwargs["encoder_fp"]:
    -76            kwargs["encoder"] = Encoder(kwargs["encoder_fp"])
    -77
    -78        field_names = set(f.name for f in dataclasses.fields(data_class))
    -79        obj = data_class(**{k: v for k, v in kwargs.items() if k in field_names})
-80        obj.validate()
    -81
    -82        return obj
    -83
    -84    @abc.abstractmethod
    -85    def validate(self):
    -86        """
    -87        Validates if the config is correct.
    -88        Must be implemented by inherited classes.
    -89        """
    -90        pass
    -
    - - -

    Config Interface with creation class methods

    -
    - - -
    -
    - - Config() - - -
    - - - - -
    -
    - -
    -
    @classmethod
    - - def - from_toml( cls, fp: Union[str, pathlib.Path], field_class, *args, **kwargs) -> debeir.interfaces.config.Config: - - - -
    - -
    25    @classmethod
    -26    def from_toml(cls, fp: Union[str, Path], field_class, *args, **kwargs) -> 'Config':
    -27        """
    -28        Instantiates a Config object from a toml file
    -29
    -30        :param fp: File path of the Config TOML file
    -31        :param field_class: Class of the Config object to be instantiated
    -32        :param args: Arguments to be passed to Config
    -33        :param kwargs: Keyword arguments to be passed
    -34        :return:
-35            An instantiated and validated Config object.
    -36        """
    -37        args_dict = toml.load(fp)
    -38
    -39        return cls.from_args(args_dict, field_class, *args, **kwargs)
    -
    - - -

    Instantiates a Config object from a toml file

    - -
    Parameters
    - -
      -
    • fp: File path of the Config TOML file
    • -
    • field_class: Class of the Config object to be instantiated
    • -
    • args: Arguments to be passed to Config
    • -
    • kwargs: Keyword arguments to be passed
    • -
    - -
    Returns
    - -
    -
An instantiated and validated Config object.
    -
    -
    -
    - - -
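A minimal usage sketch with NIRConfig (defined further down this page) as the field_class; the TOML file name and values are hypothetical, and only keys matching the dataclass fields are kept:

from debeir.interfaces.config import Config, NIRConfig

# Hypothetical nir_settings.toml containing e.g.:
#   norm_weight = "2.15"
#   evaluate = true
#   return_size = 1000
#   output_directory = "./outputs"
nir_config = Config.from_toml("nir_settings.toml", field_class=NIRConfig)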
    -
    - -
    -
    @classmethod
    - - def - from_args(cls, args_dict: MutableMapping, field_class, *args, **kwargs): - - - -
    - -
    41    @classmethod
    -42    def from_args(cls, args_dict: MutableMapping, field_class, *args, **kwargs):
    -43        """
    -44        Instantiates a Config object from arguments
    -45
    -46
    -47        :param args_dict:
    -48        :param field_class:
    -49        :param args:
    -50        :param kwargs:
    -51        :return:
    -52        """
    -53        from debeir.rankers.transformer_sent_encoder import Encoder
    -54
    -55        field_names = set(f.name for f in dataclasses.fields(field_class))
    -56        obj = field_class(**{k: v for k, v in args_dict.items() if k in field_names})
    -57        if hasattr(obj, 'encoder_fp') and obj.encoder_fp:
    -58            obj.encoder = Encoder(obj.encoder_fp, obj.encoder_normalize)
    -59
    -60        obj.validate()
    -61
    -62        return obj
    -
    - - -

    Instantiates a Config object from arguments

    - -
    Parameters
    - -
      -
  • args_dict: Mapping of argument names to values (e.g. loaded from a TOML file)
  • -
  • field_class: Dataclass of the Config object to be instantiated
  • -
  • args: Currently unused
  • -
  • kwargs: Currently unused
    • -
    - -
    Returns
    -
    - - -
    -
    - -
    -
    @classmethod
    - - def - from_dict(cls, data_class, **kwargs): - - - -
    - -
    64    @classmethod
    -65    def from_dict(cls, data_class, **kwargs):
    -66        """
    -67        Instantiates a Config object from a dictionary
    -68
    -69        :param data_class:
    -70        :param kwargs:
    -71        :return:
    -72        """
    -73        from debeir.rankers.transformer_sent_encoder import Encoder
    -74
    -75        if "encoder_fp" in kwargs and kwargs["encoder_fp"]:
    -76            kwargs["encoder"] = Encoder(kwargs["encoder_fp"])
    -77
    -78        field_names = set(f.name for f in dataclasses.fields(data_class))
    -79        obj = data_class(**{k: v for k, v in kwargs.items() if k in field_names})
-80        obj.validate()
    -81
    -82        return obj
    -
    - - -

    Instantiates a Config object from a dictionary

    - -
    Parameters
    - -
      -
  • data_class: Dataclass of the Config object to be instantiated
  • -
  • kwargs: Field values for data_class; if encoder_fp is set, an Encoder is constructed from it
    • -
    - -
    Returns
    -
    - - -
    -
    - -
    -
    @abc.abstractmethod
    - - def - validate(self): - - - -
    - -
    84    @abc.abstractmethod
    -85    def validate(self):
    -86        """
    -87        Validates if the config is correct.
    -88        Must be implemented by inherited classes.
    -89        """
    -90        pass
    -
    - - -

Validates if the config is correct. Must be implemented by inherited classes.

    -
    - - -
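A sketch of what a concrete implementation might look like in a subclass (the fields are hypothetical):

import dataclasses

from debeir.interfaces.config import Config


@dataclasses.dataclass(init=True)
class MyConfig(Config):
    # Hypothetical fields for illustration
    index: str
    return_size: int

    def validate(self):
        assert self.index, "index must be non-empty"
        assert self.return_size > 0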
    -
    -
    - -
    -
    @dataclass(init=True, unsafe_hash=True)
    - - class - GenericConfig(Config, abc.ABC): - - - -
    - -
     93@dataclass(init=True, unsafe_hash=True)
    - 94class GenericConfig(Config, ABC):
    - 95    """
    - 96    Generic NIR Configuration file for which all configs will inherit
    - 97    """
    - 98    query_type: str
    - 99    index: str = None
    -100    encoder_normalize: bool = True
    -101    ablations: bool = False
    -102    norm_weight: float = None
    -103    automatic: bool = None
    -104    encoder: object = None
    -105    encoder_fp: str = None
    -106    query_weights: List[float] = None
    -107    cosine_weights: List[float] = None
    -108    evaluate: bool = False
    -109    qrels: str = None
    -110    config_fn: str = None
    -111    query_fn: str = None
    -112    parser_fn: str = None
    -113    executor_fn: str = None
    -114    cosine_ceiling: float = None
    -115    topics_path: str = None
    -116    return_id_only: bool = False
    -117    overwrite_output_if_exists: bool = False
    -118    output_file: str = None
    -119    run_name: str = None
    -120
    -121
    -122    @classmethod
    -123    def from_toml(cls, fp: Union[str, Path], *args, **kwargs) -> 'GenericConfig':
    -124        return Config.from_toml(fp, cls, *args, **kwargs)
    -
    - - -

    Generic NIR Configuration file for which all configs will inherit

    -
    - - -
    -
    - - GenericConfig( query_type: str, index: str = None, encoder_normalize: bool = True, ablations: bool = False, norm_weight: float = None, automatic: bool = None, encoder: object = None, encoder_fp: str = None, query_weights: List[float] = None, cosine_weights: List[float] = None, evaluate: bool = False, qrels: str = None, config_fn: str = None, query_fn: str = None, parser_fn: str = None, executor_fn: str = None, cosine_ceiling: float = None, topics_path: str = None, return_id_only: bool = False, overwrite_output_if_exists: bool = False, output_file: str = None, run_name: str = None) - - -
    - - - - -
    -
    - -
    -
    @classmethod
    - - def - from_toml( cls, fp: Union[str, pathlib.Path], *args, **kwargs) -> debeir.interfaces.config.GenericConfig: - - - -
    - -
    122    @classmethod
    -123    def from_toml(cls, fp: Union[str, Path], *args, **kwargs) -> 'GenericConfig':
    -124        return Config.from_toml(fp, cls, *args, **kwargs)
    -
    - - -

    Instantiates a Config object from a toml file

    - -
    Parameters
    - -
      -
    • fp: File path of the Config TOML file
    • -
    • field_class: Class of the Config object to be instantiated
    • -
    • args: Arguments to be passed to Config
    • -
    • kwargs: Keyword arguments to be passed
    • -
    - -
    Returns
    - -
    -
An instantiated and validated Config object.
    -
    -
    -
    - - -
    -
    -
    Inherited Members
    -
    - -
    -
    -
    -
    - -
    -
    @dataclass(init=True)
    - - class - ElasticsearchConfig(Config): - - - -
    - -
    163@dataclass(init=True)
    -164class ElasticsearchConfig(Config):
    -165    """
    -166    Basic Elasticsearch configuration file settings from the master nir.toml file
    -167    """
    -168    protocol: str
    -169    ip: str
    -170    port: str
    -171    timeout: int
    -172
    -173    def validate(self):
    -174        """
    -175        Checks if Elasticsearch URL is correct
    -176        """
    -177        assert self.protocol in ['http', 'https']
    -178        assert self.port.isdigit()
    -
    - - -

    Basic Elasticsearch configuration file settings from the master nir.toml file

    -
    - - -
    -
    - - ElasticsearchConfig(protocol: str, ip: str, port: str, timeout: int) - - -
    - - - - -
    -
    - -
    - - def - validate(self): - - - -
    - -
    173    def validate(self):
    -174        """
    -175        Checks if Elasticsearch URL is correct
    -176        """
    -177        assert self.protocol in ['http', 'https']
    -178        assert self.port.isdigit()
    -
    - - -

    Checks if Elasticsearch URL is correct

    -
    - - -
    -
    -
    Inherited Members
    -
    - -
    -
    -
    -
    - -
    -
    @dataclass(init=True)
    - - class - SolrConfig(ElasticsearchConfig): - - - -
    - -
    181@dataclass(init=True)
    -182class SolrConfig(ElasticsearchConfig):
    -183    """
    -184    Basic Solr configuration file settings from the master nir.toml file
    -185    """
    -186    pass
    -
    - - -

    Basic Solr configuration file settings from the master nir.toml file

    -
    - - -
    -
    - - SolrConfig(protocol: str, ip: str, port: str, timeout: int) - - -
    - - - - -
    -
    -
    Inherited Members
    -
    - - -
    -
    -
    -
    - -
    -
    @dataclass(init=True)
    - - class - MetricsConfig(Config): - - - -
    - -
    189@dataclass(init=True)
    -190class MetricsConfig(Config):
    -191    """
    -192    Basic Metrics configuration file settings from the master nir.toml file
    -193    """
    -194    metrics: List[str]
    -195
    -196    def validate(self):
    -197        """
-198        Checks that each metric is usable by the evaluator classes (e.g. "ndcg@10")
    -199        """
    -200        for metric in self.metrics:
    -201            assert "@" in metric
    -202
    -203            metric, depth = metric.split("@")
    -204
    -205            assert metric.isalpha()
    -206            assert depth.isdigit()
    -
    - - -

    Basic Metrics configuration file settings from the master nir.toml file

    -
    - - -
    -
    - - MetricsConfig(metrics: List[str]) - - -
    - - - - -
    -
    - -
    - - def - validate(self): - - - -
    - -
    196    def validate(self):
    -197        """
-198        Checks that each metric is usable by the evaluator classes (e.g. "ndcg@10")
    -199        """
    -200        for metric in self.metrics:
    -201            assert "@" in metric
    -202
    -203            metric, depth = metric.split("@")
    -204
    -205            assert metric.isalpha()
    -206            assert depth.isdigit()
    -
    - - -

Checks that each metric is usable by the evaluator classes (e.g. "ndcg@10")

    -
    - - -
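Each metric string must therefore have the form name@depth, with an alphabetic name and an integer depth. For example (illustrative values):

from debeir.interfaces.config import MetricsConfig

config = MetricsConfig(metrics=["ndcg@10", "recall@100"])
config.validate()  # passes: alphabetic names, integer depths

# MetricsConfig(metrics=["ndcg"]).validate()  # would raise AssertionError: no "@" separator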
    -
    -
    Inherited Members
    -
    - -
    -
    -
    -
    - -
    -
    @dataclass(init=True)
    - - class - NIRConfig(Config): - - - -
    - -
    209@dataclass(init=True)
    -210class NIRConfig(Config):
    -211    """
    -212    Basic NIR configuration file settings from the master nir.toml file
    -213    """
    -214    norm_weight: str
    -215    evaluate: bool
    -216    return_size: int
    -217    output_directory: str
    -218
    -219    def validate(self):
    -220        return True
    -
    - - -

    Basic NIR configuration file settings from the master nir.toml file

    -
    - - -
    -
    - - NIRConfig( norm_weight: str, evaluate: bool, return_size: int, output_directory: str) - - -
    - - - - -
    -
    - -
    - - def - validate(self): - - - -
    - -
    219    def validate(self):
    -220        return True
    -
    - - -

Validates if the config is correct. Must be implemented by inherited classes.

    -
    - - -
    -
    -
    Inherited Members
    -
    - -
    -
    -
    -
    - -
    - - def - apply_config(func): - - - -
    - -
    223def apply_config(func):
    -224    """
    -225    Configuration decorator.
    -226
    -227    :param func: Decorated function
    -228    :return:
    -229    """
    -230
    -231    def use_config(self, *args, **kwargs):
    -232        """
    -233        Replaces keywords and args passed to the function with ones from self.config.
    -234
    -235        :param self:
    -236        :param args: To be updated
    -237        :param kwargs: To be updated
    -238        :return:
    -239        """
    -240        if self.config is not None:
    -241            kwargs = self.config.__update__(**kwargs)
    -242
    -243        return func(self, *args, **kwargs)
    -244
    -245    return use_config
    -
    - - -

    Configuration decorator.

    - -
    Parameters
    - -
      -
    • func: Decorated function
    • -
    - -
    Returns
    -
    - - -
    -
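A sketch of the decorator in use (the Executor class is hypothetical; it assumes self.config is a Config whose attributes get merged into the call's kwargs via __update__):

from debeir.interfaces.config import NIRConfig, apply_config


class Executor:
    def __init__(self, config):
        self.config = config

    @apply_config
    def run(self, **kwargs):
        # kwargs now also contains the config's attributes
        return kwargs


executor = Executor(NIRConfig(norm_weight="2.15", evaluate=True,
                              return_size=10, output_directory="./out"))
print(executor.run())  # includes norm_weight, evaluate, return_size, ...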
    - -
    - - def - override_with_toml_config(func): - - - -
    - -
    248def override_with_toml_config(func):
    -249    """
-250    Configuration decorator. Overwrite a function's kwargs and args with a specified toml config file.
-251    Pass override_with_config_=path/to/config
    -252
    -253    :param func: Decorated function
    -254    :return:
    -255    """
    -256
    -257    def override_with(override_with_config_: str = None, *args, **kwargs):
    -258        """
    -259        Replaces keywords and args passed to the function with ones from self.config.
    -260
    -261        :param override_with_config_: Path to config else None
    -262        :param args: To be updated
    -263        :param kwargs: To be updated
    -264        :return:
    -265        """
    -266
    -267        if f"override_{func.__name__}_with_config_" in kwargs:
-268            override_with_config_ = kwargs[f"override_{func.__name__}_with_config_"]
    -269
    -270        if override_with_config_ is not None:
    -271            if os.path.exists(override_with_config_):
    -272                toml_kwargs = toml.load(override_with_config_)
-273                kwargs.update(toml_kwargs)
    -274
    -275        return func(*args, **kwargs)
    -276
    -277    return override_with
    -
    - - -

Configuration decorator. Overwrite a function's kwargs and args with a specified toml config file. Pass override_with_config_=path/to/config.

    - -
    Parameters
    - -
      -
    • func: Decorated function
    • -
    - -
    Returns
    -
    - - -
    -
    - -
    - - def - save_kwargs_to_file(func): - - - -
    - -
    280def save_kwargs_to_file(func):
    -281    def save_kwargs(save_kwargs_to_: str = None, *args, **kwargs):
    -282        """
-283        Save kwargs passed to the function to output_file = f"{save_kwargs_to_}/{func.__name__}.toml"
    -284
    -285        :param save_kwargs_to_: Path to save location for config else None. This should be a DIRECTORY.
    -286        :param args: To be updated
    -287        :param kwargs: To be updated
    -288        :return:
    -289        """
    -290        if save_kwargs_to_ is not None:
    -291            os.makedirs(save_kwargs_to_, exist_ok=True)
    -292
    -293            if os.path.exists(save_kwargs_to_):
    -294                output_file = f"{save_kwargs_to_}/{func.__name__}.toml"
    -295                loguru.logger.info(f"Saving kwargs to {output_file}")
    -296                toml.dump(kwargs, open(output_file, "w+"))
    -297
    -298        return func(*args, **kwargs)
    -299
    -300    return save_kwargs
    -
    - - - - -
    -
\ No newline at end of file
diff --git a/docs/debeir/interfaces/converters.html b/docs/debeir/interfaces/converters.html
deleted file mode 100644
index 49d7be1..0000000
--- a/docs/debeir/interfaces/converters.html
+++ /dev/null
@@ -1,443 +0,0 @@
-debeir.interfaces.converters API documentation
    -
    -

    -debeir.interfaces.converters

    - - - - - - -
     1from collections import defaultdict
    - 2from typing import Dict, Union
    - 3from debeir.interfaces.parser import Parser
    - 4
    - 5import datasets
    - 6
    - 7
    - 8class ParsedTopicsToDataset:
    - 9    """
    -10    Converts a parser's output to a huggingface dataset object.
    -11    """
    -12    @classmethod
    -13    def convert(cls, parser: Parser, output: Dict[Union[str, int], Dict]):
    -14        """
    -15        Flatten a Dict of shape (traditional parser output)
    -16        {topic_id: {
    -17                "Facet_1": ...
    -18                "Facet_2": ...
    -19            }
    -20        }
    -21
    -22        ->
    -23
    -24        To a flattened arrow-like dataset.
    -25        {
    -26        topic_ids: [],
    -27        Facet_1s: [],
    -28        Facet_2s: [],
    -29        }
    -30
    -31        :param output: Topics output from the parser object
    -32        :return:
    -33        """
    -34        flattened_topics = defaultdict(lambda: [])
    -35
    -36        for topic_id, topic in output.items():
    -37            flattened_topics["topic_id"].append(topic_id)
    -38
    -39            for field in parser.parse_fields:
    -40                if field in topic:
    -41                    flattened_topics[field].append(topic[field])
    -42                else:
    -43                    flattened_topics[field].append(None)
    -44
    -45        return datasets.Dataset.from_dict(flattened_topics)
    -
    - - -
    -
    - -
    - - class - ParsedTopicsToDataset: - - - -
    - -
     9class ParsedTopicsToDataset:
    -10    """
    -11    Converts a parser's output to a huggingface dataset object.
    -12    """
    -13    @classmethod
    -14    def convert(cls, parser: Parser, output: Dict[Union[str, int], Dict]):
    -15        """
    -16        Flatten a Dict of shape (traditional parser output)
    -17        {topic_id: {
    -18                "Facet_1": ...
    -19                "Facet_2": ...
    -20            }
    -21        }
    -22
    -23        ->
    -24
    -25        To a flattened arrow-like dataset.
    -26        {
    -27        topic_ids: [],
    -28        Facet_1s: [],
    -29        Facet_2s: [],
    -30        }
    -31
    -32        :param output: Topics output from the parser object
    -33        :return:
    -34        """
    -35        flattened_topics = defaultdict(lambda: [])
    -36
    -37        for topic_id, topic in output.items():
    -38            flattened_topics["topic_id"].append(topic_id)
    -39
    -40            for field in parser.parse_fields:
    -41                if field in topic:
    -42                    flattened_topics[field].append(topic[field])
    -43                else:
    -44                    flattened_topics[field].append(None)
    -45
    -46        return datasets.Dataset.from_dict(flattened_topics)
    -
    - - -

    Converts a parser's output to a huggingface dataset object.

    -
    - - -
    -
    - - ParsedTopicsToDataset() - - -
    - - - - -
    -
    - -
    -
    @classmethod
    - - def - convert( cls, parser: debeir.interfaces.parser.Parser, output: Dict[Union[str, int], Dict]): - - - -
    - -
    13    @classmethod
    -14    def convert(cls, parser: Parser, output: Dict[Union[str, int], Dict]):
    -15        """
    -16        Flatten a Dict of shape (traditional parser output)
    -17        {topic_id: {
    -18                "Facet_1": ...
    -19                "Facet_2": ...
    -20            }
    -21        }
    -22
    -23        ->
    -24
    -25        To a flattened arrow-like dataset.
    -26        {
    -27        topic_ids: [],
    -28        Facet_1s: [],
    -29        Facet_2s: [],
    -30        }
    -31
    -32        :param output: Topics output from the parser object
    -33        :return:
    -34        """
    -35        flattened_topics = defaultdict(lambda: [])
    -36
    -37        for topic_id, topic in output.items():
    -38            flattened_topics["topic_id"].append(topic_id)
    -39
    -40            for field in parser.parse_fields:
    -41                if field in topic:
    -42                    flattened_topics[field].append(topic[field])
    -43                else:
    -44                    flattened_topics[field].append(None)
    -45
    -46        return datasets.Dataset.from_dict(flattened_topics)
    -
    - - -

    Flatten a Dict of shape (traditional parser output) -{topic_id: { - "Facet_1": ... - "Facet_2": ... - } -}

    - -

    ->

    - -

    To a flattened arrow-like dataset. -{ -topic_ids: [], -Facet_1s: [], -Facet_2s: [], -}

    - -
    Parameters
    - -
      -
    • output: Topics output from the parser object
    • -
    - -
    Returns
    -
    - - -
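A short sketch of the conversion, assuming `parser` is any instantiated `Parser` whose `parse_fields` is `["title", "summary"]`; the topics dict mimics the traditional parser output described above:

```python
from debeir.interfaces.converters import ParsedTopicsToDataset

topics = {
    1: {"title": "diabetes treatment", "summary": "..."},
    2: {"title": "asthma"},  # missing facets are filled with None
}

ds = ParsedTopicsToDataset.convert(parser, topics)
print(ds["topic_id"])  # [1, 2]
print(ds["summary"])   # ["...", None]
```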
    -
    -
    - - \ No newline at end of file diff --git a/docs/debeir/interfaces/document.html b/docs/debeir/interfaces/document.html deleted file mode 100644 index 58f8a85..0000000 --- a/docs/debeir/interfaces/document.html +++ /dev/null @@ -1,772 +0,0 @@ - - - - - - - debeir.interfaces.document API documentation - - - - - - - - - -
    -
    -

    -debeir.interfaces.document

    - - - - - - -
      1import abc
    -  2import dataclasses
    -  3from typing import Union, Dict, List
    -  4
    -  5from debeir.utils.utils import flatten
    -  6
    -  7
    -  8@dataclasses.dataclass
    -  9class Document:
    - 10    """
    - 11    Generic Document class.
    - 12    Used as an interface for interacting across multiple indexes with different mappings.
    - 13    """
    - 14    doc_id: Union[int, float, str]
    - 15    facets: Dict
    - 16    score: Union[float, int] = 0.0
    - 17
    - 18    @classmethod
    - 19    @abc.abstractmethod
    - 20    def from_results(cls) -> List['Document']:
    - 21        """
    - 22        Produces a list of Document objects from raw results returned from the index
    - 23        """
    - 24        pass
    - 25
    - 26    def get_document_id(self):
    - 27        """
    - 28        :return:
    - 29            self.doc_id
    - 30        """
    - 31        return self.doc_id
    - 32
    - 33    def flatten_facets(self, *args, **kwargs):
    - 34        """
    - 35        Flattens multi-level internal document facets into a single level
    - 36            e.g. Doc['Upper']['Lower'] -> Doc['Upper_Lower']
    - 37        :param args:
    - 38        :param kwargs:
    - 39        """
    - 40        self.facets = flatten(self.facets, *args, **kwargs)
    - 41
    - 42    @classmethod
    - 43    def _get_document_facet(cls, intermediate_repr, key):
    - 44        return intermediate_repr[key]
    - 45
    - 46    def get_document_facet(self, key, sep="_"):
    - 47        """
    - 48        Retrieve a document facet
    - 49        Works for multidimensional keys or single
    - 50        :param key: Facet to retrieve
    - 51        :param sep: The separator for multidimensional key
    - 52        :return:
    - 53            Returns the document facet given the key (field)
    - 54        """
    - 55        if sep in key:
    - 56            keys = key.split(sep)
    - 57
    - 58            intermediate_repr = self.facets
    - 59            for key in keys:
    - 60                intermediate_repr = self._get_document_facet(intermediate_repr, key)
    - 61
    - 62            return intermediate_repr
    - 63
    - 64        return self.facets[key]
    - 65
    - 66    def set(self, doc_id=None, facets=None, score=None, facet=None, facet_value=None) -> 'Document':
    - 67        """
    - 68        Set attributes of the object. Use keyword arguments to do so. Works as a builder class.
    - 69        doc.set(doc_id="123").set(facets={"title": "my title"})
    - 70        :param doc_id:
    - 71        :param facets:
    - 72        :param score:
    - 73        :param facet:
    - 74        :param facet_value:
    - 75
    - 76        :return:
    - 77            Returns document object
    - 78        """
    - 79        if doc_id is not None:
    - 80            self.doc_id = doc_id
    - 81
    - 82        if facets is not None:
    - 83            self.facets = facets
    - 84
    - 85        if score is not None:
    - 86            self.score = score
    - 87
    - 88        if facet is not None and facet_value is not None:
    - 89            self.facets[facet] = facet_value
    - 90
    - 91        return self
    - 92
    - 93    def _get_trec_format(self) -> str:
    - 94        """
    - 95        Returns TREC format for the document
    - 96        :return:
    - 97            A trec formatted string
    - 98        """
    - 99        return f"{self.score}"
    -100
    -101    @classmethod
    -102    def get_trec_format(cls, ranked_list: List['Document'], sort=True):
    -103        """
    -104        Get the trec format of a list of ranked documents. This function is a generator.
    -105
    -106        :param ranked_list: A list of Document-type objects
    -107        :param sort: Whether to sort the input list in descending order of score.
    -108        """
    -109
    -110        if sort:
    -111            ranked_list.sort(key=lambda doc: doc.score, reverse=True)
    -112
    -113        for document in ranked_list:
    -114            yield document._get_trec_format()
    -
    - - -
    -
    - -
    -
    @dataclasses.dataclass
    - - class - Document: - - - -
    - -
      9@dataclasses.dataclass
    - 10class Document:
    - 11    """
    - 12    Generic Document class.
    - 13    Used as an interface for interacting across multiple indexes with different mappings.
    - 14    """
    - 15    doc_id: Union[int, float, str]
    - 16    facets: Dict
    - 17    score: Union[float, int] = 0.0
    - 18
    - 19    @classmethod
    - 20    @abc.abstractmethod
    - 21    def from_results(cls) -> List['Document']:
    - 22        """
    - 23        Produces a list of Document objects from raw results returned from the index
    - 24        """
    - 25        pass
    - 26
    - 27    def get_document_id(self):
    - 28        """
    - 29        :return:
    - 30            self.doc_id
    - 31        """
    - 32        return self.doc_id
    - 33
    - 34    def flatten_facets(self, *args, **kwargs):
    - 35        """
    - 36        Flattens multi-level internal document facets into a single level
    - 37            e.g. Doc['Upper']['Lower'] -> Doc['Upper_Lower']
    - 38        :param args:
    - 39        :param kwargs:
    - 40        """
    - 41        self.facets = flatten(self.facets, *args, **kwargs)
    - 42
    - 43    @classmethod
    - 44    def _get_document_facet(cls, intermediate_repr, key):
    - 45        return intermediate_repr[key]
    - 46
    - 47    def get_document_facet(self, key, sep="_"):
    - 48        """
    - 49        Retrieve a document facet
    - 50        Works for multidimensional keys or single
    - 51        :param key: Facet to retrieve
    - 52        :param sep: The separator for multidimensional key
    - 53        :return:
    - 54            Returns the document facet given the key (field)
    - 55        """
    - 56        if sep in key:
    - 57            keys = key.split(sep)
    - 58
    - 59            intermediate_repr = self.facets
    - 60            for key in keys:
    - 61                intermediate_repr = self._get_document_facet(intermediate_repr, key)
    - 62
    - 63            return intermediate_repr
    - 64
    - 65        return self.facets[key]
    - 66
    - 67    def set(self, doc_id=None, facets=None, score=None, facet=None, facet_value=None) -> 'Document':
    - 68        """
    - 69        Set attributes of the object. Use keyword arguments to do so. Works as a builder class.
    - 70        doc.set(doc_id="123").set(facets={"title": "my title"})
    - 71        :param doc_id:
    - 72        :param facets:
    - 73        :param score:
    - 74        :param facet:
    - 75        :param facet_value:
    - 76
    - 77        :return:
    - 78            Returns document object
    - 79        """
    - 80        if doc_id is not None:
    - 81            self.doc_id = doc_id
    - 82
    - 83        if facets is not None:
    - 84            self.facets = facets
    - 85
    - 86        if score is not None:
    - 87            self.score = score
    - 88
    - 89        if facet is not None and facet_value is not None:
    - 90            self.facets[facet] = facet_value
    - 91
    - 92        return self
    - 93
    - 94    def _get_trec_format(self) -> str:
    - 95        """
    - 96        Returns TREC format for the document
    - 97        :return:
    - 98            A trec formatted string
    - 99        """
    -100        return f"{self.score}"
    -101
    -102    @classmethod
    -103    def get_trec_format(cls, ranked_list: List['Document'], sort=True):
    -104        """
    -105        Get the trec format of a list of ranked documents. This function is a generator.
    -106
    -107        :param ranked_list: A list of Document-type objects
    -108        :param sort: Whether to sort the input list in descending order of score.
    -109        """
    -110
    -111        if sort:
    -112            ranked_list.sort(key=lambda doc: doc.score, reverse=True)
    -113
    -114        for document in ranked_list:
    -115            yield document._get_trec_format()
    -
    - - -

    Generic Document class. -Used as an interface for interacting across multiple indexes with different mappings.

    -
    - - -
    -
    - - Document( doc_id: Union[int, float, str], facets: Dict, score: Union[float, int] = 0.0) - - -
    - - - - -
    -
    - -
    -
    @classmethod
    -
    @abc.abstractmethod
    - - def - from_results(cls) -> List[debeir.interfaces.document.Document]: - - - -
    - -
    19    @classmethod
    -20    @abc.abstractmethod
    -21    def from_results(cls) -> List['Document']:
    -22        """
    -23        Produces a list of Document objects from raw results returned from the index
    -24        """
    -25        pass
    -
    - - -

    Produces a list of Document objects from raw results returned from the index

    -
    - - -
    -
    - -
    - - def - get_document_id(self): - - - -
    - -
    27    def get_document_id(self):
    -28        """
    -29        :return:
    -30            self.doc_id
    -31        """
    -32        return self.doc_id
    -
    - - -
    Returns
    - -
    -
    self.doc_id
    -
    -
    -
    - - -
    -
    - -
    - - def - flatten_facets(self, *args, **kwargs): - - - -
    - -
    34    def flatten_facets(self, *args, **kwargs):
    -35        """
    -36        Flattens multi-level internal document facets into a single level
    -37            e.g. Doc['Upper']['Lower'] -> Doc['Upper_Lower']
    -38        :param args:
    -39        :param kwargs:
    -40        """
    -41        self.facets = flatten(self.facets, *args, **kwargs)
    -
    - - -

    Flattens multi-level internal document facets into a single level - e.g. Doc['Upper']['Lower'] -> Doc['Upper_Lower']

    - -
    Parameters
    - -
      -
    • args:
    • -
    • kwargs:
    • -
    -
    - - -
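A small sketch of the transformation (the exact separator is determined by `debeir.utils.utils.flatten`; `_` matches the example in the docstring):

```python
from debeir.interfaces.document import Document

doc = Document(doc_id="d1", facets={"Upper": {"Lower": "value"}})
doc.flatten_facets()
print(doc.facets)  # {"Upper_Lower": "value"}, assuming "_" as the join separator
```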
    -
    - -
    - - def - get_document_facet(self, key, sep='_'): - - - -
    - -
    47    def get_document_facet(self, key, sep="_"):
    -48        """
    -49        Retrieve a document facet
    -50        Works for multidimensional keys or single
    -51        :param key: Facet to retrieve
    -52        :param sep: The separator for multidimensional key
    -53        :return:
    -54            Returns the document facet given the key (field)
    -55        """
    -56        if sep in key:
    -57            keys = key.split(sep)
    -58
    -59            intermediate_repr = self.facets
    -60            for key in keys:
    -61                intermediate_repr = self._get_document_facet(intermediate_repr, key)
    -62
    -63            return intermediate_repr
    -64
    -65        return self.facets[key]
    -
    - - -

    Retrieve a document facet -Works for multidimensional keys or single

    - -
    Parameters
    - -
      -
    • key: Facet to retrieve
    • -
    • sep: The separator for multidimensional key
    • -
    - -
    Returns
    - -
    -
    Returns the document facet given the key (field)
    -
    -
    -
    - - -
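A sketch of both lookup modes, following the flattening example above:

```python
doc = Document(doc_id="d1", facets={"Upper": {"Lower": "value"}, "title": "t"})

doc.get_document_facet("title")        # "t" -- plain single-level lookup
doc.get_document_facet("Upper_Lower")  # "value" -- split on "_" and walked level by level
```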
    -
    - -
    - - def - set( self, doc_id=None, facets=None, score=None, facet=None, facet_value=None) -> debeir.interfaces.document.Document: - - - -
    - -
    67    def set(self, doc_id=None, facets=None, score=None, facet=None, facet_value=None) -> 'Document':
    -68        """
    -69        Set attributes of the object. Use keyword arguments to do so. Works as a builder class.
    -70        doc.set(doc_id="123").set(facets={"title": "my title"})
    -71        :param doc_id:
    -72        :param facets:
    -73        :param score:
    -74        :param facet:
    -75        :param facet_value:
    -76
    -77        :return:
    -78            Returns document object
    -79        """
    -80        if doc_id is not None:
    -81            self.doc_id = doc_id
    -82
    -83        if facets is not None:
    -84            self.facets = facets
    -85
    -86        if score is not None:
    -87            self.score = score
    -88
    -89        if facet is not None and facet_value is not None:
    -90            self.facets[facet] = facet_value
    -91
    -92        return self
    -
    - - -

    Set attributes of the object. Use keyword arguments to do so. Works as a builder class. -doc.set(doc_id="123").set(facets={"title": "my title"})

    - -
    Parameters
    - -
      -
    • doc_id:
    • -
    • facets:
    • -
    • score:
    • -
    • facet:
    • -
    • facet_value:
    • -
    - -
    Returns
    - -
    -
    Returns document object
    -
    -
    -
    - - -
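A sketch of the builder-style chaining the docstring describes:

```python
doc = Document(doc_id="tmp", facets={})
doc.set(doc_id="123").set(facets={"title": "my title"}).set(score=4.2)
doc.set(facet="abstract", facet_value="...")  # set a single facet in place
```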
    -
    - -
    -
    @classmethod
    - - def - get_trec_format( cls, ranked_list: List[debeir.interfaces.document.Document], sort=True): - - - -
    - -
    102    @classmethod
    -103    def get_trec_format(cls, ranked_list: List['Document'], sort=True):
    -104        """
    -105        Get the trec format of a list of ranked documents. This function is a generator.
    -106
    -107        :param ranked_list: A list of Document-type objects
    -108        :param sort: Whether to sort the input list in descending order of score.
    -109        """
    -110
    -111        if sort:
    -112            ranked_list.sort(key=lambda doc: doc.score, reverse=True)
    -113
    -114        for document in ranked_list:
    -115            yield document._get_trec_format()
    -
    - - -

    Get the trec format of a list of ranked documents. This function is a generator.

    - -
    Parameters
    - -
      -
    • ranked_list: A list of Document-type objects
    • -
    • sort: Whether to sort the input list in descending order of score.
    • -
    -
    - - -
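A sketch of consuming the generator (note that the base `_get_trec_format` only emits the score field; subclasses would override it for full TREC run lines):

```python
docs = [Document("a", {}, 1.0), Document("b", {}, 3.5)]
for line in Document.get_trec_format(docs, sort=True):
    print(line)  # "3.5" then "1.0"
```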
    -
    -
    - - \ No newline at end of file diff --git a/docs/debeir/interfaces/executor.html b/docs/debeir/interfaces/executor.html deleted file mode 100644 index a72992a..0000000 --- a/docs/debeir/interfaces/executor.html +++ /dev/null @@ -1,943 +0,0 @@ - - - - - - - debeir.interfaces.executor API documentation - - - - - - - - - -
    -
    -

    -debeir.interfaces.executor

    - - - - - - -
      1from typing import Dict, Union, Optional
    -  2
    -  3import loguru
    -  4from elasticsearch import AsyncElasticsearch as Elasticsearch
    -  5
    -  6from debeir.interfaces.query import GenericElasticsearchQuery
    -  7from debeir.engines.elasticsearch.executor import ElasticsearchExecutor
    -  8from debeir.interfaces.config import NIRConfig, GenericConfig
    -  9from debeir.rankers.transformer_sent_encoder import Encoder
    - 10from debeir.utils.scaler import unpack_elasticsearch_scores
    - 11
    - 12
    - 13class GenericElasticsearchExecutor(ElasticsearchExecutor):
    - 14    """
    - 15    Generic Executor class for Elasticsearch
    - 16    """
    - 17    query: GenericElasticsearchQuery
    - 18
    - 19    def __init__(
    - 20        self,
    - 21        topics: Dict[Union[str, int], Dict[str, str]],
    - 22        client: Elasticsearch,
    - 23        index_name: str,
    - 24        output_file: str,
    - 25        query: GenericElasticsearchQuery,
    - 26        encoder: Optional[Encoder] = None,
    - 27        config=None,
    - 28        *args,
    - 29        **kwargs,
    - 30    ):
    - 31        super().__init__(
    - 32            topics,
    - 33            client,
    - 34            index_name,
    - 35            output_file,
    - 36            query,
    - 37            encoder,
    - 38            config=config,
    - 39            *args,
    - 40            **kwargs,
    - 41        )
    - 42
    - 43        self.query_fns = {
    - 44            "query": self.generate_query,
    - 45            "embedding": self.generate_embedding_query,
    - 46        }
    - 47
    - 48    def generate_query(self, topic_num, best_fields=True, **kwargs):
    - 49        """
    - 50        Generates a standard BM25 query given the topic number
    - 51
    - 52        :param topic_num: Query topic number to generate
    - 53        :param best_fields: Whether to use a curated list of fields
    - 54        :param kwargs:
    - 55        :return:
    - 56        """
    - 57        return self.query.generate_query(topic_num, **kwargs)
    - 58
    - 59    #def generate_query_ablation(self, topic_num, **kwargs):
    - 60    #    return self.query.generate_query_ablation(topic_num)
    - 61
    - 62    def generate_embedding_query(
    - 63        self,
    - 64        topic_num,
    - 65        cosine_weights=None,
    - 66        query_weights=None,
    - 67        norm_weight=2.15,
    - 68        automatic_scores=None,
    - 69        **kwargs,
    - 70    ):
    - 71        """
    - 72        Generates an NIR-style query with combined scoring.
    - 73
    - 74        :param topic_num:
    - 75        :param cosine_weights:
    - 76        :param query_weights:
    - 77        :param norm_weight:
    - 78        :param automatic_scores:
    - 79        :param kwargs:
    - 80        :return:
    - 81        """
    - 82        assert self.encoder is not None or self.config.encoder is not None
    - 83
    - 84        if "encoder" not in kwargs:
    - 85            kwargs["encoder"] = self.encoder
    - 86
    - 87        return self.query.generate_query_embedding(
    - 88            topic_num,
    - 89            cosine_weights=cosine_weights,
    - 90            query_weight=query_weights,
    - 91            norm_weight=norm_weight,
    - 92            automatic_scores=automatic_scores,
    - 93            **kwargs,
    - 94        )
    - 95
    - 96    #@apply_config
    - 97    async def execute_query(
    - 98        self, query=None, return_size: int=None, return_id_only: bool=None,
    - 99            topic_num=None, ablation=False, query_type=None,
    -100            **kwargs
    -101    ):
    -102        """
    -103        Executes a query using the underlying elasticsearch client.
    -104
    -105        :param query:
    -106        :param topic_num:
    -107        :param ablation:
    -108        :param query_type:
    -109        :param return_size:
    -110        :param return_id_only:
    -111        :param kwargs:
    -112        :return:
    -113        """
    -114
    -115        if ablation:
    -116            query_type = "ablation"
    -117
    -118        assert query is not None or topic_num is not None
    -119
    -120        if query:
    -121            if return_id_only:
    -122                # query["fields"] = [self.query.id_mapping]
    -123                # query["_source"] = False
    -124                query["_source"] = [self.query.id_mapping]
    -125            res = await self.client.search(
    -126                index=self.index_name, body=query, size=return_size
    -127            )
    -128
    -129            return [query, res]
    -130
    -131        if topic_num:
    -132            loguru.logger.debug(query_type)
    -133            body = self.query_fns[query_type](topic_num=topic_num, **kwargs)
    -134            if return_id_only:
    -135                loguru.logger.debug("Skip")
    -136                body["_source"] = [self.query.id_mapping]
    -137
    -138            loguru.logger.debug(body)
    -139            res = await self.client.search(
    -140                index=self.index_name, body=body, size=return_size
    -141            )
    -142
    -143            return [topic_num, res]
    -144
    -145    async def run_automatic_adjustment(self):
    -146        """
    -147        Get the normalization constant to be used in NIR-style queries for all topics given an initial
    -148        run of BM25 results.
    -149        """
    -150        loguru.logger.info("Running automatic BM25 weight adjustment")
    -151
    -152        # Backup variables temporarily
    -153        #size = self.return_size
    -154        #self.return_size = 1
    -155        #self.return_id_only = True
    -156        #prev_qt = self.config.query_type
    -157        #self.config.query_type = "query"
    -158
    -159        results = await self.run_all_queries(query_type="query",
    -160                                             return_results=True,
    -161                                             return_size=1,
    -162                                             return_id_only=True)
    -163
    -164        results = unpack_elasticsearch_scores(results)
    -165        self.query.set_bm25_scores(results)
    -166
    -167    @classmethod
    -168    def build_from_config(cls, topics: Dict, query_obj: GenericElasticsearchQuery, client,
    -169                          config: GenericConfig, nir_config: NIRConfig):
    -170        """
    -171        Build a query executor engine from a config file.
    -172        """
    -173
    -174        return cls(
    -175            topics=topics,
    -176            client=client,
    -177            config=config,
    -178            index_name=config.index,
    -179            output_file="",
    -180            return_size=nir_config.return_size,
    -181            query=query_obj
    -182        )
    -
    - - -
    -
    - -
    - - class - GenericElasticsearchExecutor(debeir.engines.elasticsearch.executor.ElasticsearchExecutor): - - - -
    - -
     14class GenericElasticsearchExecutor(ElasticsearchExecutor):
    - 15    """
    - 16    Generic Executor class for Elasticsearch
    - 17    """
    - 18    query: GenericElasticsearchQuery
    - 19
    - 20    def __init__(
    - 21        self,
    - 22        topics: Dict[Union[str, int], Dict[str, str]],
    - 23        client: Elasticsearch,
    - 24        index_name: str,
    - 25        output_file: str,
    - 26        query: GenericElasticsearchQuery,
    - 27        encoder: Optional[Encoder] = None,
    - 28        config=None,
    - 29        *args,
    - 30        **kwargs,
    - 31    ):
    - 32        super().__init__(
    - 33            topics,
    - 34            client,
    - 35            index_name,
    - 36            output_file,
    - 37            query,
    - 38            encoder,
    - 39            config=config,
    - 40            *args,
    - 41            **kwargs,
    - 42        )
    - 43
    - 44        self.query_fns = {
    - 45            "query": self.generate_query,
    - 46            "embedding": self.generate_embedding_query,
    - 47        }
    - 48
    - 49    def generate_query(self, topic_num, best_fields=True, **kwargs):
    - 50        """
    - 51        Generates a standard BM25 query given the topic number
    - 52
    - 53        :param topic_num: Query topic number to generate
    - 54        :param best_fields: Whether to use a curated list of fields
    - 55        :param kwargs:
    - 56        :return:
    - 57        """
    - 58        return self.query.generate_query(topic_num, **kwargs)
    - 59
    - 60    #def generate_query_ablation(self, topic_num, **kwargs):
    - 61    #    return self.query.generate_query_ablation(topic_num)
    - 62
    - 63    def generate_embedding_query(
    - 64        self,
    - 65        topic_num,
    - 66        cosine_weights=None,
    - 67        query_weights=None,
    - 68        norm_weight=2.15,
    - 69        automatic_scores=None,
    - 70        **kwargs,
    - 71    ):
    - 72        """
    - 73        Generates an NIR-style query with combined scoring.
    - 74
    - 75        :param topic_num:
    - 76        :param cosine_weights:
    - 77        :param query_weights:
    - 78        :param norm_weight:
    - 79        :param automatic_scores:
    - 80        :param kwargs:
    - 81        :return:
    - 82        """
    - 83        assert self.encoder is not None or self.config.encoder is not None
    - 84
    - 85        if "encoder" not in kwargs:
    - 86            kwargs["encoder"] = self.encoder
    - 87
    - 88        return self.query.generate_query_embedding(
    - 89            topic_num,
    - 90            cosine_weights=cosine_weights,
    - 91            query_weight=query_weights,
    - 92            norm_weight=norm_weight,
    - 93            automatic_scores=automatic_scores,
    - 94            **kwargs,
    - 95        )
    - 96
    - 97    #@apply_config
    - 98    async def execute_query(
    - 99        self, query=None, return_size: int=None, return_id_only: bool=None,
    -100            topic_num=None, ablation=False, query_type=None,
    -101            **kwargs
    -102    ):
    -103        """
    -104        Executes a query using the underlying elasticsearch client.
    -105
    -106        :param query:
    -107        :param topic_num:
    -108        :param ablation:
    -109        :param query_type:
    -110        :param return_size:
    -111        :param return_id_only:
    -112        :param kwargs:
    -113        :return:
    -114        """
    -115
    -116        if ablation:
    -117            query_type = "ablation"
    -118
    -119        assert query is not None or topic_num is not None
    -120
    -121        if query:
    -122            if return_id_only:
    -123                # query["fields"] = [self.query.id_mapping]
    -124                # query["_source"] = False
    -125                query["_source"] = [self.query.id_mapping]
    -126            res = await self.client.search(
    -127                index=self.index_name, body=query, size=return_size
    -128            )
    -129
    -130            return [query, res]
    -131
    -132        if topic_num:
    -133            loguru.logger.debug(query_type)
    -134            body = self.query_fns[query_type](topic_num=topic_num, **kwargs)
    -135            if return_id_only:
    -136                loguru.logger.debug("Skip")
    -137                body["_source"] = [self.query.id_mapping]
    -138
    -139            loguru.logger.debug(body)
    -140            res = await self.client.search(
    -141                index=self.index_name, body=body, size=return_size
    -142            )
    -143
    -144            return [topic_num, res]
    -145
    -146    async def run_automatic_adjustment(self):
    -147        """
    -148        Get the normalization constant to be used in NIR-style queries for all topics given an initial
    -149        run of BM25 results.
    -150        """
    -151        loguru.logger.info("Running automatic BM25 weight adjustment")
    -152
    -153        # Backup variables temporarily
    -154        #size = self.return_size
    -155        #self.return_size = 1
    -156        #self.return_id_only = True
    -157        #prev_qt = self.config.query_type
    -158        #self.config.query_type = "query"
    -159
    -160        results = await self.run_all_queries(query_type="query",
    -161                                             return_results=True,
    -162                                             return_size=1,
    -163                                             return_id_only=True)
    -164
    -165        results = unpack_elasticsearch_scores(results)
    -166        self.query.set_bm25_scores(results)
    -167
    -168    @classmethod
    -169    def build_from_config(cls, topics: Dict, query_obj: GenericElasticsearchQuery, client,
    -170                          config: GenericConfig, nir_config: NIRConfig):
    -171        """
    -172        Build a query executor engine from a config file.
    -173        """
    -174
    -175        return cls(
    -176            topics=topics,
    -177            client=client,
    -178            config=config,
    -179            index_name=config.index,
    -180            output_file="",
    -181            return_size=nir_config.return_size,
    -182            query=query_obj
    -183        )
    -
    - - -

    Generic Executor class for Elasticsearch

    -
    - - -
    - -
    - - GenericElasticsearchExecutor( topics: Dict[Union[str, int], Dict[str, str]], client: elasticsearch.AsyncElasticsearch, index_name: str, output_file: str, query: debeir.interfaces.query.GenericElasticsearchQuery, encoder: Optional[debeir.rankers.transformer_sent_encoder.Encoder] = None, config=None, *args, **kwargs) - - - -
    - -
    20    def __init__(
    -21        self,
    -22        topics: Dict[Union[str, int], Dict[str, str]],
    -23        client: Elasticsearch,
    -24        index_name: str,
    -25        output_file: str,
    -26        query: GenericElasticsearchQuery,
    -27        encoder: Optional[Encoder] = None,
    -28        config=None,
    -29        *args,
    -30        **kwargs,
    -31    ):
    -32        super().__init__(
    -33            topics,
    -34            client,
    -35            index_name,
    -36            output_file,
    -37            query,
    -38            encoder,
    -39            config=config,
    -40            *args,
    -41            **kwargs,
    -42        )
    -43
    -44        self.query_fns = {
    -45            "query": self.generate_query,
    -46            "embedding": self.generate_embedding_query,
    -47        }
    -
    - - - - -
    -
    - -
    - - def - generate_query(self, topic_num, best_fields=True, **kwargs): - - - -
    - -
    49    def generate_query(self, topic_num, best_fields=True, **kwargs):
    -50        """
    -51        Generates a standard BM25 query given the topic number
    -52
    -53        :param topic_num: Query topic number to generate
    -54        :param best_fields: Whether to use a curated list of fields
    -55        :param kwargs:
    -56        :return:
    -57        """
    -58        return self.query.generate_query(topic_num, **kwargs)
    -
    - - -

    Generates a standard BM25 query given the topic number

    - -
    Parameters
    - -
      -
    • topic_num: Query topic number to generate
    • -
    • best_fields: Whether to use a curated list of fields
    • -
    • kwargs:
    • -
    - -
    Returns
    -
    - - -
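A sketch of generating and running a BM25 body, assuming an `executor` built as shown under build_from_config below:

```python
body = executor.generate_query(topic_num=1)
_, res = await executor.execute_query(query=body, return_size=10)
```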
    -
    - -
    - - def - generate_embedding_query( self, topic_num, cosine_weights=None, query_weights=None, norm_weight=2.15, automatic_scores=None, **kwargs): - - - -
    - -
    63    def generate_embedding_query(
    -64        self,
    -65        topic_num,
    -66        cosine_weights=None,
    -67        query_weights=None,
    -68        norm_weight=2.15,
    -69        automatic_scores=None,
    -70        **kwargs,
    -71    ):
    -72        """
    -73        Executes an NIR-style query with combined scoring.
    -74
    -75        :param topic_num:
    -76        :param cosine_weights:
    -77        :param query_weights:
    -78        :param norm_weight:
    -79        :param automatic_scores:
    -80        :param kwargs:
    -81        :return:
    -82        """
    -83        assert self.encoder is not None or self.config.encoder is not None
    -84
    -85        if "encoder" not in kwargs:
    -86            kwargs["encoder"] = self.encoder
    -87
    -88        return self.query.generate_query_embedding(
    -89            topic_num,
    -90            cosine_weights=cosine_weights,
    -91            query_weight=query_weights,
    -92            norm_weight=norm_weight,
    -93            automatic_scores=automatic_scores,
    -94            **kwargs,
    -95        )
    -
    - - -

    Generates an NIR-style query with combined scoring.

    - -
    Parameters
    - -
      -
    • topic_num:
    • -
    • cosine_weights:
    • -
    • query_weights:
    • -
    • norm_weight:
    • -
    • automatic_scores:
    • -
    • kwargs:
    • -
    - -
    Returns
    -
    - - -
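A sketch of building the NIR-style body (the weight values are illustrative only; `norm_weight` defaults to 2.15):

```python
body = executor.generate_embedding_query(
    topic_num=1,
    cosine_weights=[0.7, 0.3],   # assumed per-field cosine weights
    query_weights=[1.0, 0.5],    # assumed per-field BM25 weights
    norm_weight=2.15,
)
```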
    -
    - -
    - - async def - execute_query( self, query=None, return_size: int = None, return_id_only: bool = None, topic_num=None, ablation=False, query_type=None, **kwargs): - - - -
    - -
     98    async def execute_query(
    - 99        self, query=None, return_size: int=None, return_id_only: bool=None,
    -100            topic_num=None, ablation=False, query_type=None,
    -101            **kwargs
    -102    ):
    -103        """
    -104        Executes a query using the underlying elasticsearch client.
    -105
    -106        :param query:
    -107        :param topic_num:
    -108        :param ablation:
    -109        :param query_type:
    -110        :param return_size:
    -111        :param return_id_only:
    -112        :param kwargs:
    -113        :return:
    -114        """
    -115
    -116        if ablation:
    -117            query_type = "ablation"
    -118
    -119        assert query is not None or topic_num is not None
    -120
    -121        if query:
    -122            if return_id_only:
    -123                # query["fields"] = [self.query.id_mapping]
    -124                # query["_source"] = False
    -125                query["_source"] = [self.query.id_mapping]
    -126            res = await self.client.search(
    -127                index=self.index_name, body=query, size=return_size
    -128            )
    -129
    -130            return [query, res]
    -131
    -132        if topic_num:
    -133            loguru.logger.debug(query_type)
    -134            body = self.query_fns[query_type](topic_num=topic_num, **kwargs)
    -135            if return_id_only:
    -136                loguru.logger.debug("Skip")
    -137                body["_source"] = [self.query.id_mapping]
    -138
    -139            loguru.logger.debug(body)
    -140            res = await self.client.search(
    -141                index=self.index_name, body=body, size=return_size
    -142            )
    -143
    -144            return [topic_num, res]
    -
    - - -

    Executes a query using the underlying elasticsearch client.

    - -
    Parameters
    - -
      -
    • query:
    • -
    • topic_num:
    • -
    • ablation:
    • -
    • query_type:
    • -
    • return_size:
    • -
    • return_id_only:
    • -
    • kwargs:
    • -
    -
    - - -
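A sketch of the topic-number path, where the body is generated internally from `query_type` (`"query"` for BM25, `"embedding"` for NIR-style):

```python
topic_num, res = await executor.execute_query(
    topic_num=1, query_type="query", return_size=100, return_id_only=True
)
```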
    -
    - -
    - - async def - run_automatic_adjustment(self): - - - -
    - -
    146    async def run_automatic_adjustment(self):
    -147        """
    -148        Get the normalization constant to be used in NIR-style queries for all topics given an initial
    -149        run of BM25 results.
    -150        """
    -151        loguru.logger.info("Running automatic BM25 weight adjustment")
    -152
    -153        # Backup variables temporarily
    -154        #size = self.return_size
    -155        #self.return_size = 1
    -156        #self.return_id_only = True
    -157        #prev_qt = self.config.query_type
    -158        #self.config.query_type = "query"
    -159
    -160        results = await self.run_all_queries(query_type="query",
    -161                                             return_results=True,
    -162                                             return_size=1,
    -163                                             return_id_only=True)
    -164
    -165        results = unpack_elasticsearch_scores(results)
    -166        self.query.set_bm25_scores(results)
    -
    - - -

    Get the normalization constant to be used in NIR-style queries for all topics given an initial -run of BM25 results.

    -
    - - -
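A typical sequence might look like the following sketch, fitting the constants from a BM25 pass before the embedding run:

```python
await executor.run_automatic_adjustment()
results = await executor.run_all_queries(query_type="embedding", return_results=True)
```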
    -
    - -
    -
    @classmethod
    - - def - build_from_config( cls, topics: Dict, query_obj: debeir.interfaces.query.GenericElasticsearchQuery, client, config: debeir.interfaces.config.GenericConfig, nir_config: debeir.interfaces.config.NIRConfig): - - - -
    - -
    168    @classmethod
    -169    def build_from_config(cls, topics: Dict, query_obj: GenericElasticsearchQuery, client,
    -170                          config: GenericConfig, nir_config: NIRConfig):
    -171        """
    -172        Build a query executor engine from a config file.
    -173        """
    -174
    -175        return cls(
    -176            topics=topics,
    -177            client=client,
    -178            config=config,
    -179            index_name=config.index,
    -180            output_file="",
    -181            return_size=nir_config.return_size,
    -182            query=query_obj
    -183        )
    -
    - - -

    Build a query executor engine from a config file.

    -
    - - -
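A wiring sketch (the variable names are illustrative; `config` and `nir_config` are parsed `GenericConfig`/`NIRConfig` objects):

```python
executor = GenericElasticsearchExecutor.build_from_config(
    topics=topics,         # parsed topics, e.g. from a Parser
    query_obj=query,       # a GenericElasticsearchQuery
    client=es_client,      # an AsyncElasticsearch instance
    config=config,         # supplies the index name
    nir_config=nir_config  # supplies return_size
)
```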
    - -
    -
    - - \ No newline at end of file diff --git a/docs/debeir/interfaces/indexer.html b/docs/debeir/interfaces/indexer.html deleted file mode 100644 index 90e8bc9..0000000 --- a/docs/debeir/interfaces/indexer.html +++ /dev/null @@ -1,724 +0,0 @@ - - - - - - - debeir.interfaces.indexer API documentation - - - - - - - - - -
    -
    -

    -debeir.interfaces.indexer

    - - - - - - -
      1import abc
    -  2import threading
    -  3from queue import Queue
    -  4from typing import List
    -  5
    -  6from elasticsearch import Elasticsearch
    -  7
    -  8from debeir.rankers.transformer_sent_encoder import Encoder
    -  9from debeir.utils.utils import remove_excess_whitespace
    - 10
    - 11
    - 12class Indexer:
    - 13    def __init__(self, client):
    - 14        super().__init__()
    - 15        self.client = client
    - 16
    - 17    @abc.abstractmethod
    - 18    def get_field(self, document, field):
    - 19        pass
    - 20
    - 21
    - 22class SemanticElasticsearchIndexer(Indexer, threading.Thread):
    - 23    """
    - 24    Create a NIR-style index with dense field representations using the provided sentence encoder.
    - 25    Assumes the documents have already been indexed.
    - 26    """
    - 27
    - 28    def __init__(self, es_client: Elasticsearch, encoder: Encoder, index: str,
    - 29                 fields_to_encode: List[str], queue: Queue):
    - 30        super().__init__(es_client)
    - 31        self.encoder = encoder
    - 32        self.index = index
    - 33        self.fields = fields_to_encode
    - 34        self.q = queue
    - 35        self.update_mappings(self.index, self.fields, self.client)
    - 36
    - 37    @classmethod
    - 38    def update_mappings(cls, index, fields, client: Elasticsearch):
    - 39        mapping = {}
    - 40        value = {
    - 41            "type": "dense_vector",
    - 42            "dims": 768
    - 43        }
    - 44
    - 45        for field in fields:
    - 46            mapping[field + "_Embedding"] = value
    - 47            mapping[field + "_Text"] = {"type": "text"}
    - 48
    - 49        client.indices.put_mapping(
    - 50            body={
    - 51                "properties": mapping
    - 52            }, index=index)
    - 53
    - 54    # async def create_index(self, document_itr=None):
    - 55    #    await self._update_mappings()
    - 56
    - 57    #    if document_itr is None:
    - 58    #        document_itr = helpers.async_scan(self.es_client, index=self.index)
    - 59
    - 60    #    bar = tqdm(desc="Indexing", total=35_000)
    - 61
    - 62    #    async for document in document_itr:
    - 63    #        doc = document["_source"]
    - 64    #        await self.index_document(doc)
    - 65
    - 66    #        bar.update(1)
    - 67
    - 68    def get_field(self, document, field):
    - 69        if field not in document:
    - 70            return False
    - 71
    - 72        if f"{field}_Text" in document and document[f"{field}_Text"] != 0:
    - 73            return False
    - 74
    - 75        if 'Textblock' in document[field]:
    - 76            return remove_excess_whitespace(document[field]['Textblock'])
    - 77
    - 78        return remove_excess_whitespace(document[field])
    - 79
    - 80    def index_document(self, document):
    - 81        update_doc = {}
    - 82        doc = document["_source"]
    - 83
    - 84        for field in self.fields:
    - 85            text_field = self.get_field(doc, field)
    - 86
    - 87            if text_field:
    - 88                embedding = self.encoder.encode(self.encoder, topic=text_field, disable_cache=True)
    - 89                update_doc[f"{field}_Embedding"] = embedding
    - 90                update_doc[f"{field}_Text"] = text_field
    - 91
    - 92        if update_doc:
    - 93            self.client.update(index=self.index,
    - 94                               id=document['_id'],
    - 95                               doc=update_doc)
    - 96
    - 97    def run(self):
    - 98        while not self.q.empty():
    - 99            document = self.q.get()
    -100            self.index_document(document)
    -
    - - -
    -
    - -
    - - class - Indexer: - - - -
    - -
    13class Indexer:
    -14    def __init__(self, client):
    -15        super().__init__()
    -16        self.client = client
    -17
    -18    @abc.abstractmethod
    -19    def get_field(self, document, field):
    -20        pass
    -
    - - - - -
    - -
    - - Indexer(client) - - - -
    - -
    14    def __init__(self, client):
    -15        super().__init__()
    -16        self.client = client
    -
    - - - - -
    -
    - -
    -
    @abc.abstractmethod
    - - def - get_field(self, document, field): - - - -
    - -
    18    @abc.abstractmethod
    -19    def get_field(self, document, field):
    -20        pass
    -
    - - - - -
    -
    -
    - -
    - - class - SemanticElasticsearchIndexer(Indexer, threading.Thread): - - - -
    - -
     23class SemanticElasticsearchIndexer(Indexer, threading.Thread):
    - 24    """
    - 25    Create a NIR-style index with dense field representations using the provided sentence encoder.
    - 26    Assumes the documents have already been indexed.
    - 27    """
    - 28
    - 29    def __init__(self, es_client: Elasticsearch, encoder: Encoder, index: str,
    - 30                 fields_to_encode: List[str], queue: Queue):
    - 31        super().__init__(es_client)
    - 32        self.encoder = encoder
    - 33        self.index = index
    - 34        self.fields = fields_to_encode
    - 35        self.q = queue
    - 36        self.update_mappings(self.index, self.fields, self.client)
    - 37
    - 38    @classmethod
    - 39    def update_mappings(cls, index, fields, client: Elasticsearch):
    - 40        mapping = {}
    - 41        value = {
    - 42            "type": "dense_vector",
    - 43            "dims": 768
    - 44        }
    - 45
    - 46        for field in fields:
    - 47            mapping[field + "_Embedding"] = value
    - 48            mapping[field + "_Text"] = {"type": "text"}
    - 49
    - 50        client.indices.put_mapping(
    - 51            body={
    - 52                "properties": mapping
    - 53            }, index=index)
    - 54
    - 55    # async def create_index(self, document_itr=None):
    - 56    #    await self._update_mappings()
    - 57
    - 58    #    if document_itr is None:
    - 59    #        document_itr = helpers.async_scan(self.es_client, index=self.index)
    - 60
    - 61    #    bar = tqdm(desc="Indexing", total=35_000)
    - 62
    - 63    #    async for document in document_itr:
    - 64    #        doc = document["_source"]
    - 65    #        await self.index_document(doc)
    - 66
    - 67    #        bar.update(1)
    - 68
    - 69    def get_field(self, document, field):
    - 70        if field not in document:
    - 71            return False
    - 72
    - 73        if f"{field}_Text" in document and document[f"{field}_Text"] != 0:
    - 74            return False
    - 75
    - 76        if 'Textblock' in document[field]:
    - 77            return remove_excess_whitespace(document[field]['Textblock'])
    - 78
    - 79        return remove_excess_whitespace(document[field])
    - 80
    - 81    def index_document(self, document):
    - 82        update_doc = {}
    - 83        doc = document["_source"]
    - 84
    - 85        for field in self.fields:
    - 86            text_field = self.get_field(doc, field)
    - 87
    - 88            if text_field:
    - 89                embedding = self.encoder.encode(self.encoder, topic=text_field, disable_cache=True)
    - 90                update_doc[f"{field}_Embedding"] = embedding
    - 91                update_doc[f"{field}_Text"] = text_field
    - 92
    - 93        if update_doc:
    - 94            self.client.update(index=self.index,
    - 95                               id=document['_id'],
    - 96                               doc=update_doc)
    - 97
    - 98    def run(self):
    - 99        while not self.q.empty():
    -100            document = self.q.get()
    -101            self.index_document(document)
    -
    - - -

    Create a NIR-style index with dense field representations using the provided sentence encoder. -Assumes the documents have already been indexed.

    -
    - - -
    - -
    - - SemanticElasticsearchIndexer( es_client: elasticsearch.Elasticsearch, encoder: debeir.rankers.transformer_sent_encoder.Encoder, index: str, fields_to_encode: List[str], queue: queue.Queue) - - - -
    - -
    29    def __init__(self, es_client: Elasticsearch, encoder: Encoder, index: str,
    -30                 fields_to_encode: List[str], queue: Queue):
    -31        super().__init__(es_client)
    -32        self.encoder = encoder
    -33        self.index = index
    -34        self.fields = fields_to_encode
    -35        self.q = queue
    -36        self.update_mappings(self.index, self.fields, self.client)
    -
    - - -

    This constructor should always be called with keyword arguments. Arguments are:

    - -

    group should be None; reserved for future extension when a ThreadGroup -class is implemented.

    - -

    target is the callable object to be invoked by the run() -method. Defaults to None, meaning nothing is called.

    - -

    name is the thread name. By default, a unique name is constructed of -the form "Thread-N" where N is a small decimal number.

    - -

    args is the argument tuple for the target invocation. Defaults to ().

    - -

    kwargs is a dictionary of keyword arguments for the target -invocation. Defaults to {}.

    - -

    If a subclass overrides the constructor, it must make sure to invoke -the base class constructor (Thread.__init__()) before doing anything -else to the thread.

    -
    - - -
    -
    - -
    -
    @classmethod
    - - def - update_mappings(cls, index, fields, client: elasticsearch.Elasticsearch): - - - -
    - -
    38    @classmethod
    -39    def update_mappings(cls, index, fields, client: Elasticsearch):
    -40        mapping = {}
    -41        value = {
    -42            "type": "dense_vector",
    -43            "dims": 768
    -44        }
    -45
    -46        for field in fields:
    -47            mapping[field + "_Embedding"] = value
    -48            mapping[field + "_Text"] = {"type": "text"}
    -49
    -50        client.indices.put_mapping(
    -51            body={
    -52                "properties": mapping
    -53            }, index=index)
    -
    - - - - -
    -
    - -
    - - def - get_field(self, document, field): - - - -
    - -
    69    def get_field(self, document, field):
    -70        if field not in document:
    -71            return False
    -72
    -73        if f"{field}_Text" in document and document[f"{field}_Text"] != 0:
    -74            return False
    -75
    -76        if 'Textblock' in document[field]:
    -77            return remove_excess_whitespace(document[field]['Textblock'])
    -78
    -79        return remove_excess_whitespace(document[field])
    -
    - - - - -
    -
    - -
    - - def - index_document(self, document): - - - -
    - -
    81    def index_document(self, document):
    -82        update_doc = {}
    -83        doc = document["_source"]
    -84
    -85        for field in self.fields:
    -86            text_field = self.get_field(doc, field)
    -87
    -88            if text_field:
    -89                embedding = self.encoder.encode(self.encoder, topic=text_field, disable_cache=True)
    -90                update_doc[f"{field}_Embedding"] = embedding
    -91                update_doc[f"{field}_Text"] = text_field
    -92
    -93        if update_doc:
    -94            self.client.update(index=self.index,
    -95                               id=document['_id'],
    -96                               doc=update_doc)
    -
    - - - - -
    -
    - -
    - - def - run(self): - - - -
    - -
     98    def run(self):
    - 99        while not self.q.empty():
    -100            document = self.q.get()
    -101            self.index_document(document)
    -
    - - -

    Method representing the thread's activity.

    - -

    You may override this method in a subclass. The standard run() method -invokes the callable object passed to the object's constructor as the -target argument, if any, with sequential and keyword arguments taken -from the args and kwargs arguments, respectively.

    -
    - - -
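A sketch of driving the indexer with a queue of raw Elasticsearch hits across several worker threads (`es`, `encoder`, and `hits` are assumed to exist; `update_mappings` runs once per worker and is idempotent):

```python
from queue import Queue

q = Queue()
for hit in hits:          # raw hits, e.g. from elasticsearch.helpers.scan
    q.put(hit)

workers = [
    SemanticElasticsearchIndexer(es, encoder, "my-index", ["Title", "Abstract"], q)
    for _ in range(4)
]
for w in workers:
    w.start()
for w in workers:
    w.join()
```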
    -
    -
    Inherited Members
    -
    -
    threading.Thread
    -
    start
    -
    join
    -
    name
    -
    ident
    -
    is_alive
    -
    daemon
    -
    isDaemon
    -
    setDaemon
    -
    getName
    -
    setName
    -
    native_id
    - -
    -
    -
    -
    -
    - - \ No newline at end of file diff --git a/docs/debeir/interfaces/parser.html b/docs/debeir/interfaces/parser.html deleted file mode 100644 index 6065e63..0000000 --- a/docs/debeir/interfaces/parser.html +++ /dev/null @@ -1,1079 +0,0 @@ - - - - - - - debeir.interfaces.parser API documentation - - - - - - - - - -
    -
    -

    -debeir.interfaces.parser

    - - - - - - -
      1import abc
    -  2import dataclasses
    -  3from collections import defaultdict
    -  4from dataclasses import dataclass
    -  5import csv
    -  6from typing import Dict, List, Optional
    -  7from xml.etree import ElementTree as ET
    -  8import dill
    -  9import json
    - 10
    - 11import loguru
    - 12import pandas as pd
    - 13
    - 14
    - 15# TODO: Parse fields can come from a config or ID_fields
    - 16# TODO: move _get_topics to private cls method with arguments, and expose get_topics as an instance method.
    - 17
    - 18
    - 19@dataclass(init=False)
    - 20class Parser:
    - 21    """
    - 22    Parser interface
    - 23    """
    - 24
    - 25    parse_fields: List[str]
    - 26
    - 27    @classmethod
    - 28    def normalize(cls, input_dict) -> Dict:
    - 29        """
    - 30        Flatten the dictionary, i.e. from Dict[int, Dict] -> Dict[str, str_or_int]
    - 31
    - 32        :param input_dict:
    - 33        :return:
    - 34        """
    - 35        return pd.json_normalize(input_dict,
    - 36                                  sep=".").to_dict(orient='records')[0]
    - 37
    - 38    def get_topics(self, path, *args, **kwargs):
    - 39        """
    - 40        Instance method for getting topics, forwards instance self parameters to the _get_topics class method.
    - 41        """
    - 42
    - 43        self_kwargs = vars(self)
    - 44        kwargs.update(self_kwargs)
    - 45
    - 46        return self._get_topics(path, *args, **kwargs)
    - 47
    - 48    @classmethod
    - 49    @abc.abstractmethod
    - 50    def _get_topics(cls, path, *args, **kwargs) -> Dict[int, Dict[str, str]]:
    - 51        raise NotImplementedError
    - 52
    - 53
    - 54@dataclasses.dataclass(init=True)
    - 55class PickleParser(Parser):
    - 56    """
    - 57    Load topics from a pickle file
    - 58    """
    - 59
    - 60    @classmethod
    - 61    def _get_topics(cls, path, *args, **kwargs) -> Dict[int, Dict[str, str]]:
    - 62        return dill.load(path)
    - 63
    - 64
    - 65@dataclasses.dataclass(init=True)
    - 66class XMLParser(Parser):
    - 67    """
    - 68    Load topics from an XML file
    - 69    """
    - 70    topic_field_name: str
    - 71    id_field: str
    - 72    parse_fields: List[str]
    - 73
    - 74    @classmethod
    - 75    def _recurse_to_child_node(cls, node: ET.Element, track: List):
    - 76        """
    - 77        Helper method to get all children nodes for text extraction in an xml.
    - 78
    - 79        :param node: Current node
    - 80        :param track: List to track nodes
    - 81        :return:
    - 82        """
    - 83        if len(node) > 0:  # Element.getchildren() was removed in Python 3.9
    - 84            for child in node:
    - 85                track.append(cls._recurse_to_child_node(child, track))
    - 86
    - 87        return node
    - 88
    - 89    @classmethod
    - 90    def unwrap(cls, doc_dict, key):
    - 91        """
    - 92        Converts defaultdict to dict and list of size 1 to just the element
    - 93
    - 94        :param doc_dict:
    - 95        :param key:
    - 96        """
    - 97        if isinstance(doc_dict[key], defaultdict):
    - 98            doc_dict[key] = dict(doc_dict[key])
    - 99
    -100            for e_key in doc_dict[key]:
    -101                cls.unwrap(doc_dict[key], e_key)
    -102
    -103        if isinstance(doc_dict[key], list):
    -104            if len(doc_dict[key]) == 1:
    -105                doc_dict[key] = doc_dict[key][0]
    -106
    -107    def _get_topics(self, path, *args, **kwargs) -> Dict[int, Dict[str, str]]:
    -108        all_topics = ET.parse(path).getroot()
    -109        qtopics = {}
    -110
    -111        for topic in all_topics.findall(self.topic_field_name):
    -112            _id = topic.attrib[self.id_field]
    -113            if _id.isnumeric():
    -114                _id = int(_id)
    -115
    -116            if self.parse_fields:
    -117                temp = {}
    -118                for field in self.parse_fields:
    -119                    try:
    -120                        temp[field] = topic.find(field).text.strip()
    -121                    except AttributeError:
    -122                        continue
    -123
    -124                qtopics[_id] = temp
    -125            else:
    -126                #  The topic contains the text
    -127                qtopics[_id] = {"query": topic.text.strip()}
    -128
    -129        return qtopics
    -130
    -131
    -132@dataclasses.dataclass
    -133class CSVParser(Parser):
    -134    """
    -135    Loads topics from a CSV file
    -136    """
    -137    id_field = "id"
    -138    parse_fields = ["text"]
    -139
    -140    def __init__(self, parse_fields=None):
    -141        if parse_fields is None:
    -142            parse_fields = ["id", "text"]
    -143        self.parse_fields = parse_fields
    -143
    -144    @classmethod
    -145    def _get_topics(cls, csvfile, dialect="excel",
    -146                    id_field: str=None,
    -147                    parse_fields: List[str]=None,
    -148                    *args, **kwargs) -> Dict[int, Dict[str, str]]:
    -149        topics = {}
    -150
    -151        if isinstance(csvfile, str):
    -152            csvfile = open(csvfile, 'rt')
    -153
    -154        if id_field is None:
    -155            id_field = cls.id_field
    -156
    -157        if parse_fields is None:
    -158            parse_fields = cls.parse_fields
    -159
    -160        reader = csv.DictReader(csvfile, dialect=dialect)
    -161        for row in reader:
    -162            temp = {}
    -163
    -164            for field in parse_fields:
    -165                temp[field] = row[field]
    -166
    -167            topics[row[id_field]] = temp
    -168
    -169        return topics
    -170
    -171
    -172@dataclasses.dataclass(init=True)
    -173class TSVParser(CSVParser):
    -174
    -175    @classmethod
    -176    def _get_topics(cls, tsvfile, *args, **kwargs) -> Dict[int, Dict[str, str]]:
    -177        return CSVParser._get_topics(tsvfile, *args, dialect='excel-tab', **kwargs)
    -178
    -179
    -180@dataclasses.dataclass(init=True)
    -181class JsonLinesParser(Parser):
    -182    """
    -183    Loads topics from a JSONL file (one JSON object per line).
    -184
    -185    Provide parse_fields, id_field, and whether to ignore full matches on JSON keys.
    -186    secondary_id is appended to the primary id, since JSON lines have a flattened structure and may contain duplicate ids.
    -188    """
    -189    parse_fields: List[str]
    -190    id_field: str
    -191    ignore_full_match: bool = True
    -192    secondary_id: str = None
    -193
    -194    @classmethod
    -195    def _get_topics(cls, jsonlfile, id_field, parse_fields,
    -196                    ignore_full_match=True, secondary_id=None, *args, **kwargs) -> Dict[str, Dict]:
    -197        with open(jsonlfile, "r") as jsonl_f:
    -198            topics = {}
    -199
    -200            for jsonl in jsonl_f:
    -201                json_dict = json.loads(jsonl)
    -202                _id = json_dict.pop(id_field)
    -203
    -204                if secondary_id:
    -205                    _id = str(_id) + "_" + str(json_dict[secondary_id])
    -206
    -207                for key in list(json_dict.keys()):
    -208                    found = False
    -209                    for _key in parse_fields:
    -210                        if ignore_full_match:
    -211                            if key in _key or key == _key or _key in key:
    -212                                found = True
    -213                        else:
    -214                            if _key == key:
    -215                                found = True
    -216                    if not found:
    -217                        json_dict.pop(key)
    -218
    -219                topics[_id] = json_dict
    -220
    -221        return topics
    -
    - - -
    -
    - -
    -
    @dataclass(init=False)
    class Parser:
    - -
    20@dataclass(init=False)
    -21class Parser:
    -22    """
    -23    Parser interface
    -24    """
    -25
    -26    parse_fields: List[str]
    -27
    -28    @classmethod
    -29    def normalize(cls, input_dict) -> Dict:
    -30        """
    -31        Flatten the dictionary, i.e. from Dict[int, Dict] -> Dict[str, str_or_int]
    -32
    -33        :param input_dict:
    -34        :return:
    -35        """
    -36        return pd.json_normalize(input_dict,
    -37                                 sep=".").to_dict(orient='records')[0]
    -38
    -39    def get_topics(self, path, *args, **kwargs):
    -40        """
    -41        Instance method for getting topics; forwards instance attributes to the _get_topics class method.
    -42        """
    -43
    -44        self_kwargs = vars(self)
    -45        kwargs.update(self_kwargs)
    -46
    -47        return self._get_topics(path, *args, **kwargs)
    -48
    -49    @classmethod
    -50    @abc.abstractmethod
    -51    def _get_topics(cls, path, *args, **kwargs) -> Dict[int, Dict[str, str]]:
    -52        raise NotImplementedError
    -
    - - -

    Parser interface

    -
    - - -
    -
    Parser()
    - - - - -
    -
    - -
    -
    @classmethod
    def normalize(cls, input_dict) -> Dict:
    - -
    28    @classmethod
    -29    def normalize(cls, input_dict) -> Dict:
    -30        """
    -31        Flatten the dictionary, i.e. from Dict[int, Dict] -> Dict[str, str_or_int]
    -32
    -33        :param input_dict:
    -34        :return:
    -35        """
    -36        return pd.json_normalize(input_dict,
    -37                                 sep=".").to_dict(orient='records')[0]
    -
    - - -

    Flatten the dictionary, i.e. from Dict[int, Dict] -> Dict[str, str_or_int]

    Parameters

    • input_dict:

    Returns
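    As a quick illustration of what normalize produces (a minimal sketch; the nested input dict is hypothetical):

        nested = {"topic": {"title": "diabetes", "body": "treatment options"}}
        flat = Parser.normalize(nested)
        # flat == {"topic.title": "diabetes", "topic.body": "treatment options"}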
    -
    - -
    def get_topics(self, path, *args, **kwargs):
    - -
    39    def get_topics(self, path, *args, **kwargs):
    -40        """
    -41        Instance method for getting topics; forwards instance attributes to the _get_topics class method.
    -42        """
    -43
    -44        self_kwargs = vars(self)
    -45        kwargs.update(self_kwargs)
    -46
    -47        return self._get_topics(path, *args, **kwargs)
    -
    - - -

    Instance method for getting topics; forwards instance attributes to the _get_topics class method.

    -
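    Concrete parsers implement only the _get_topics classmethod; get_topics then forwards the instance's attributes (via vars(self)) as keyword arguments. A minimal sketch of a hypothetical subclass, not part of the library:

        import dataclasses
        from typing import Dict, List

        @dataclasses.dataclass(init=True)
        class TabSeparatedParser(Parser):  # hypothetical example subclass
            parse_fields: List[str]

            @classmethod
            def _get_topics(cls, path, *args, **kwargs) -> Dict[int, Dict[str, str]]:
                topics = {}
                with open(path) as f:
                    for i, line in enumerate(f):
                        _id, text = line.rstrip("\n").split("\t", maxsplit=1)
                        topics[i] = {"id": _id, "text": text}
                return topics

        parser = TabSeparatedParser(parse_fields=["id", "text"])
        topics = parser.get_topics("topics.tsv")  # hypothetical file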
    - - -
    -
    -
    - -
    -
    @dataclasses.dataclass(init=True)
    class PickleParser(Parser):
    - -
    55@dataclasses.dataclass(init=True)
    -56class PickleParser(Parser):
    -57    """
    -58    Load topics from a pickle file
    -59    """
    -60
    -61    @classmethod
    -62    def _get_topics(cls, path, *args, **kwargs) -> Dict[int, Dict[str, str]]:
    -63        with open(path, "rb") as f:
    -64            return dill.load(f)
    -
    - - -

    Load topics from a pickle file

    -
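    A hedged usage sketch (the file name is hypothetical; dill deserializes whatever topic mapping was pickled):

        parser = PickleParser(parse_fields=["text"])
        topics = parser.get_topics("topics.pkl")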
    - - -
    -
    PickleParser(parse_fields: List[str])
    - - - - -
    -
    -
    Inherited Members
    -
    - -
    -
    -
    -
    - -
    -
    @dataclasses.dataclass(init=True)
    class XMLParser(Parser):
    - -
     66@dataclasses.dataclass(init=True)
    - 67class XMLParser(Parser):
    - 68    """
    - 69    Load topics from an XML file
    - 70    """
    - 71    topic_field_name: str
    - 72    id_field: str
    - 73    parse_fields: List[str]
    - 74
    - 75    @classmethod
    - 76    def _recurse_to_child_node(cls, node: ET.Element, track: List):
    - 77        """
    - 78        Helper method to get all child nodes for text extraction in an XML document.
    - 79
    - 80        :param node: Current node
    - 81        :param track: List to track nodes
    - 82        :return:
    - 83        """
    - 84        if len(node) > 0:
    - 85            for child in node:
    - 86                track.append(cls._recurse_to_child_node(child, track))
    - 87
    - 88        return node
    - 89
    - 90    @classmethod
    - 91    def unwrap(cls, doc_dict, key):
    - 92        """
    - 93        Converts a defaultdict to a dict, and unwraps single-element lists into the element itself
    - 94
    - 95        :param doc_dict:
    - 96        :param key:
    - 97        """
    - 98        if isinstance(doc_dict[key], defaultdict):
    - 99            doc_dict[key] = dict(doc_dict[key])
    -100
    -101            for e_key in doc_dict[key]:
    -102                cls.unwrap(doc_dict[key], e_key)
    -103
    -104        if isinstance(doc_dict[key], list):
    -105            if len(doc_dict[key]) == 1:
    -106                doc_dict[key] = doc_dict[key][0]
    -107
    -108    def _get_topics(self, path, *args, **kwargs) -> Dict[int, Dict[str, str]]:
    -109        all_topics = ET.parse(path).getroot()
    -110        qtopics = {}
    -111
    -112        for topic in all_topics.findall(self.topic_field_name):
    -113            _id = topic.attrib[self.id_field]
    -114            if _id.isnumeric():
    -115                _id = int(_id)
    -116
    -117            if self.parse_fields:
    -118                temp = {}
    -119                for field in self.parse_fields:
    -120                    try:
    -121                        temp[field] = topic.find(field).text.strip()
    -122                    except AttributeError:
    -123                        continue
    -124
    -125                qtopics[_id] = temp
    -126            else:
    -127                #  The topic contains the text
    -128                qtopics[_id] = {"query": topic.text.strip()}
    -129
    -130        return qtopics
    -
    - - -

    Load topics from an XML file

    -
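    For example, a TREC-style topics file might be parsed as follows (the element and attribute names are illustrative assumptions, not fixed by the library):

        parser = XMLParser(
            parse_fields=["title", "description"],
            topic_field_name="topic",
            id_field="number",
        )
        topics = parser.get_topics("topics.xml")
        # e.g. {1: {"title": "...", "description": "..."}, ...}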
    - - -
    -
    XMLParser(parse_fields: List[str], topic_field_name: str, id_field: str)
    - - - - -
    -
    - -
    -
    @classmethod
    def unwrap(cls, doc_dict, key):
    - -
     90    @classmethod
    - 91    def unwrap(cls, doc_dict, key):
    - 92        """
    - 93        Converts a defaultdict to a dict, and unwraps single-element lists into the element itself
    - 94
    - 95        :param doc_dict:
    - 96        :param key:
    - 97        """
    - 98        if isinstance(doc_dict[key], defaultdict):
    - 99            doc_dict[key] = dict(doc_dict[key])
    -100
    -101            for e_key in doc_dict[key]:
    -102                cls.unwrap(doc_dict[key], e_key)
    -103
    -104        if isinstance(doc_dict[key], list):
    -105            if len(doc_dict[key]) == 1:
    -106                doc_dict[key] = doc_dict[key][0]
    -
    - - -

    Converts a defaultdict to a dict, and unwraps single-element lists into the element itself

    Parameters

    • doc_dict:
    • key:
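    A small sketch of the unwrapping behaviour:

        from collections import defaultdict

        doc = {"authors": ["Smith J"], "meta": defaultdict(list)}
        XMLParser.unwrap(doc, "authors")
        XMLParser.unwrap(doc, "meta")
        # doc == {"authors": "Smith J", "meta": {}}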
    - - -
    -
    -
    Inherited Members
    -
    - -
    -
    -
    -
    - -
    -
    @dataclasses.dataclass
    class CSVParser(Parser):
    - -
    133@dataclasses.dataclass
    -134class CSVParser(Parser):
    -135    """
    -136    Loads topics from a CSV file
    -137    """
    -138    id_field = "id"
    -139    parse_fields = ["text"]
    -140
    -141    def __init__(self, parse_fields=None):
    -142        if parse_fields is None:
    -143            parse_fields = ["id", "text"]
    -144        self.parse_fields = parse_fields
    -144
    -145    @classmethod
    -146    def _get_topics(cls, csvfile, dialect="excel",
    -147                    id_field: str=None,
    -148                    parse_fields: List[str]=None,
    -149                    *args, **kwargs) -> Dict[int, Dict[str, str]]:
    -150        topics = {}
    -151
    -152        if isinstance(csvfile, str):
    -153            csvfile = open(csvfile, 'rt')
    -154
    -155        if id_field is None:
    -156            id_field = cls.id_field
    -157
    -158        if parse_fields is None:
    -159            parse_fields = cls.parse_fields
    -160
    -161        reader = csv.DictReader(csvfile, dialect=dialect)
    -162        for row in reader:
    -163            temp = {}
    -164
    -165            for field in parse_fields:
    -166                temp[field] = row[field]
    -167
    -168            topics[row[id_field]] = temp
    -169
    -170        return topics
    -
    - - -

    Loads topics from a CSV file

    -
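    A usage sketch, assuming a CSV file whose header row contains id and text columns (both the file and its headers are hypothetical):

        parser = CSVParser()  # defaults to parse_fields=["id", "text"]
        topics = parser.get_topics("topics.csv")
        # {"1": {"id": "1", "text": "..."}, ...}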
    - - -
    - -
    CSVParser(parse_fields=None)
    - -
    141    def __init__(self, parse_fields=None):
    -142        if parse_fields is None:
    -143            parse_fields = ["id", "text"]
    -144        self.parse_fields = parse_fields
    -
    - - - - -
    -
    -
    Inherited Members
    -
    - -
    -
    -
    -
    - -
    -
    @dataclasses.dataclass(init=True)
    class TSVParser(CSVParser):
    - -
    173@dataclasses.dataclass(init=True)
    -174class TSVParser(CSVParser):
    -175
    -176    @classmethod
    -177    def _get_topics(cls, tsvfile, *args, **kwargs) -> Dict[int, Dict[str, str]]:
    -178        return CSVParser._get_topics(tsvfile, *args, dialect='excel-tab', **kwargs)
    -
    - - - - -
    -
    TSVParser(parse_fields: List[str])
    - - - - -
    -
    -
    Inherited Members
    -
    - -
    -
    -
    -
    - -
    -
    @dataclasses.dataclass(init=True)
    class JsonLinesParser(Parser):
    - -
    181@dataclasses.dataclass(init=True)
    -182class JsonLinesParser(Parser):
    -183    """
    -184    Loads topics from a JSONL file (one JSON object per line).
    -185
    -186    Provide parse_fields, id_field, and whether to ignore full matches on JSON keys.
    -187    secondary_id is appended to the primary id, since JSON lines have a flattened structure and may contain duplicate ids.
    -189    """
    -190    parse_fields: List[str]
    -191    id_field: str
    -192    ignore_full_match: bool = True
    -193    secondary_id: str = None
    -194
    -195    @classmethod
    -196    def _get_topics(cls, jsonlfile, id_field, parse_fields,
    -197                    ignore_full_match=True, secondary_id=None, *args, **kwargs) -> Dict[str, Dict]:
    -198        with open(jsonlfile, "r") as jsonl_f:
    -199            topics = {}
    -200
    -201            for jsonl in jsonl_f:
    -202                json_dict = json.loads(jsonl)
    -203                _id = json_dict.pop(id_field)
    -204
    -205                if secondary_id:
    -206                    _id = str(_id) + "_" + str(json_dict[secondary_id])
    -207
    -208                for key in list(json_dict.keys()):
    -209                    found = False
    -210                    for _key in parse_fields:
    -211                        if ignore_full_match:
    -212                            if key in _key or key == _key or _key in key:
    -213                                found = True
    -214                        else:
    -215                            if _key == key:
    -216                                found = True
    -217                    if not found:
    -218                        json_dict.pop(key)
    -219
    -220                topics[_id] = json_dict
    -221
    -222        return topics
    -
    - - -

    Loads topics from a JSONL file (one JSON object per line).

    Provide parse_fields, id_field, and whether to ignore full matches on JSON keys. secondary_id is appended to the primary id, since JSON lines have a flattened structure and may contain duplicate ids.

    -
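    For instance (the field names here are illustrative assumptions):

        parser = JsonLinesParser(
            parse_fields=["title", "abstract"],
            id_field="doc_id",
            secondary_id="version",
        )
        topics = parser.get_topics("collection.jsonl")
        # keys look like "123_2" when secondary_id is set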
    - - -
    -
    JsonLinesParser(parse_fields: List[str], id_field: str, ignore_full_match: bool = True, secondary_id: str = None)
    - - - - -
    -
    -
    Inherited Members
    -
    - -
    -
    -
    -
\ No newline at end of file
diff --git a/docs/debeir/interfaces/pipeline.html b/docs/debeir/interfaces/pipeline.html
deleted file mode 100644
index 0764e1b..0000000
--- a/docs/debeir/interfaces/pipeline.html
+++ /dev/null
@@ -1,798 +0,0 @@
    -
    -

    -debeir.interfaces.pipeline

    - - - - - - -
      1import abc
    -  2
    -  3import debeir
    -  4from loguru import logger
    -  5from typing import List
    -  6
    -  7from debeir.engines.client import Client
    -  8from debeir.data_sets.factory import factory_fn, get_nir_config
    -  9from debeir.interfaces.executor import GenericElasticsearchExecutor
    - 10from debeir.interfaces.config import Config, _NIRMasterConfig
    - 11from debeir.interfaces.config import GenericConfig
    - 12
    - 13
    - 14class Pipeline:
    - 15    pipeline_structure = ["parser", "query", "engine", "evaluator"]
    - 16    cannot_disable = ["parser", "query", "engine"]
    - 17    callbacks: List['debeir.interfaces.callbacks.Callback']
    - 18    output_file = None
    - 19
    - 20    def __init__(self, engine: GenericElasticsearchExecutor,
    - 21                 metrics_config,
    - 22                 engine_config,
    - 23                 nir_config,
    - 24                 run_config: Config,
    - 25                 callbacks = None):
    - 26
    - 27        self.engine = engine
    - 28        self.run_config = run_config
    - 29        self.metrics_config = metrics_config
    - 30        self.engine_config = engine_config
    - 31        self.nir_config = nir_config
    - 32        self.output_file = None
    - 33        self.disabled = {}
    - 34
    - 35        if callbacks is None:
    - 36            self.callbacks = []
    - 37        else:
    - 38            self.callbacks = callbacks
    - 39
    - 40    @classmethod
    - 41    def build_from_config(cls, nir_config_fp, engine, config_fp) -> 'Pipeline':
    - 42        query_cls, config, parser, executor_cls = factory_fn(config_fp)
    - 43
    - 44        nir_config, search_engine_config, metrics_config = get_nir_config(nir_config_fp,
    - 45                                                                          engine=engine,
    - 46                                                                          ignore_errors=False)
    - 47
    - 48        client = Client.build_from_config(engine, search_engine_config)
    - 49        topics = parser._get_topics(config.topics_path)
    - 50
    - 51        query = query_cls(topics=topics, query_type=config.query_type, config=config)
    - 52
    - 53        executor = executor_cls.build_from_config(
    - 54            topics,
    - 55            query,
    - 56            client.get_client(engine),
    - 57            config,
    - 58            nir_config
    - 59        )
    - 60
    - 61        return cls(
    - 62            executor,
    - 63            metrics_config,
    - 64            search_engine_config,
    - 65            nir_config,
    - 66            config
    - 67        )
    - 68
    - 69    def disable(self, parts: list):
    - 70        for part in parts:
    - 71            if part in self.pipeline_structure and part not in self.cannot_disable:
    - 72                self.disabled[part] = True
    - 73            else:
    - 74                logger.warning(f"Cannot disable {part} because it doesn't exist or is integral to the pipeline")
    - 75
    - 76    @abc.abstractmethod
    - 77    async def run_pipeline(self, *args,
    - 78                           **kwargs):
    - 79        raise NotImplementedError()
    - 80
    - 81
    - 82class NIRPipeline(Pipeline):
    - 83    run_config: GenericConfig
    - 84
    - 85    def __init__(self, *args, **kwargs):
    - 86        super().__init__(*args, **kwargs)
    - 87
    - 88    async def prehook(self):
    - 89        if self.run_config.automatic or self.run_config.norm_weight == "automatic":
    - 90            logger.info(f"Running initial BM25 for query adjustment")
    - 91            await self.engine.run_automatic_adjustment()
    - 92
    - 93    async def run_engine(self, *args, **kwargs):
    - 94        # Run bm25 nir adjustment
    - 95        logger.info(f"Running {self.run_config.query_type} queries")
    - 96
    - 97        return await self.engine.run_all_queries(*args, return_results=True, **kwargs)
    - 98
    - 99    async def posthook(self, *args, **kwargs):
    -100        pass
    -101
    -102    async def run_pipeline(self, *args, return_results=False, **kwargs):
    -103        for cb in self.callbacks:
    -104            cb.before(self)
    -105
    -106        await self.prehook()
    -107        results = await self.run_engine(*args, **kwargs)
    -108
    -109        for cb in self.callbacks:
    -110            cb.after(results)
    -111
    -112        if return_results:
    -113            return results
    -114
    -115    def register_callback(self, cb):
    -116        self.callbacks.append(cb)
    -
    - - -
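    As a rough end-to-end sketch (both config paths are hypothetical): build_from_config wires the parser, query class, search client and executor together from a run config and the NIR master config, and the resulting pipeline runs asynchronously:

        import asyncio

        pipeline = NIRPipeline.build_from_config(
            "./configs/nir.toml",                   # hypothetical NIR master config
            engine="elasticsearch",
            config_fp="./configs/my_dataset.toml",  # hypothetical run config
        )
        results = asyncio.run(pipeline.run_pipeline(return_results=True))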
    -
    - -
    class Pipeline:
    - -
    15class Pipeline:
    -16    pipeline_structure = ["parser", "query", "engine", "evaluator"]
    -17    cannot_disable = ["parser", "query", "engine"]
    -18    callbacks: List['debeir.interfaces.callbacks.Callback']
    -19    output_file = None
    -20
    -21    def __init__(self, engine: GenericElasticsearchExecutor,
    -22                 metrics_config,
    -23                 engine_config,
    -24                 nir_config,
    -25                 run_config: Config,
    -26                 callbacks = None):
    -27
    -28        self.engine = engine
    -29        self.run_config = run_config
    -30        self.metrics_config = metrics_config
    -31        self.engine_config = engine_config
    -32        self.nir_config = nir_config
    -33        self.output_file = None
    -34        self.disabled = {}
    -35
    -36        if callbacks is None:
    -37            self.callbacks = []
    -38        else:
    -39            self.callbacks = callbacks
    -40
    -41    @classmethod
    -42    def build_from_config(cls, nir_config_fp, engine, config_fp) -> 'Pipeline':
    -43        query_cls, config, parser, executor_cls = factory_fn(config_fp)
    -44
    -45        nir_config, search_engine_config, metrics_config = get_nir_config(nir_config_fp,
    -46                                                                          engine=engine,
    -47                                                                          ignore_errors=False)
    -48
    -49        client = Client.build_from_config(engine, search_engine_config)
    -50        topics = parser._get_topics(config.topics_path)
    -51
    -52        query = query_cls(topics=topics, query_type=config.query_type, config=config)
    -53
    -54        executor = executor_cls.build_from_config(
    -55            topics,
    -56            query,
    -57            client.get_client(engine),
    -58            config,
    -59            nir_config
    -60        )
    -61
    -62        return cls(
    -63            executor,
    -64            metrics_config,
    -65            search_engine_config,
    -66            nir_config,
    -67            config
    -68        )
    -69
    -70    def disable(self, parts: list):
    -71        for part in parts:
    -72            if part in self.pipeline_structure and part not in self.cannot_disable:
    -73                self.disabled[part] = True
    -74            else:
    -75                logger.warning(f"Cannot disable {part} because it doesn't exist or is integral to the pipeline")
    -76
    -77    @abc.abstractmethod
    -78    async def run_pipeline(self, *args,
    -79                           **kwargs):
    -80        raise NotImplementedError()
    -
    - - - - -
    - -
    Pipeline(engine: debeir.interfaces.executor.GenericElasticsearchExecutor, metrics_config, engine_config, nir_config, run_config: debeir.interfaces.config.Config, callbacks=None)
    - -
    21    def __init__(self, engine: GenericElasticsearchExecutor,
    -22                 metrics_config,
    -23                 engine_config,
    -24                 nir_config,
    -25                 run_config: Config,
    -26                 callbacks = None):
    -27
    -28        self.engine = engine
    -29        self.run_config = run_config
    -30        self.metrics_config = metrics_config
    -31        self.engine_config = engine_config
    -32        self.nir_config = nir_config
    -33        self.output_file = None
    -34        self.disabled = {}
    -35
    -36        if callbacks is None:
    -37            self.callbacks = []
    -38        else:
    -39            self.callbacks = callbacks
    -
    - - - - -
    -
    - -
    def disable(self, parts: list):
    - -
    70    def disable(self, parts: list):
    -71        for part in parts:
    -72            if part in self.pipeline_structure and part not in self.cannot_disable:
    -73                self.disabled[part] = True
    -74            else:
    -75                logger.warning(f"Cannot disable {part} because it doesn't exist or is integral to the pipeline")
    -
    - - - - -
    -
    - -
    -
    @classmethod
    def build_from_config(cls, nir_config_fp, engine, config_fp) -> debeir.interfaces.pipeline.Pipeline:
    - -
    41    @classmethod
    -42    def build_from_config(cls, nir_config_fp, engine, config_fp) -> 'Pipeline':
    -43        query_cls, config, parser, executor_cls = factory_fn(config_fp)
    -44
    -45        nir_config, search_engine_config, metrics_config = get_nir_config(nir_config_fp,
    -46                                                                          engine=engine,
    -47                                                                          ignore_errors=False)
    -48
    -49        client = Client.build_from_config(engine, search_engine_config)
    -50        topics = parser._get_topics(config.topics_path)
    -51
    -52        query = query_cls(topics=topics, query_type=config.query_type, config=config)
    -53
    -54        executor = executor_cls.build_from_config(
    -55            topics,
    -56            query,
    -57            client.get_client(engine),
    -58            config,
    -59            nir_config
    -60        )
    -61
    -62        return cls(
    -63            executor,
    -64            metrics_config,
    -65            search_engine_config,
    -66            nir_config,
    -67            config
    -68        )
    -
    - - - - -
    -
    - -
    -
    @abc.abstractmethod
    async def run_pipeline(self, *args, **kwargs):
    - -
    77    @abc.abstractmethod
    -78    async def run_pipeline(self, *args,
    -79                           **kwargs):
    -80        raise NotImplementedError()
    -
    - - - - -
    -
    -
    - -
    class NIRPipeline(Pipeline):
    - -
     83class NIRPipeline(Pipeline):
    - 84    run_config: GenericConfig
    - 85
    - 86    def __init__(self, *args, **kwargs):
    - 87        super().__init__(*args, **kwargs)
    - 88
    - 89    async def prehook(self):
    - 90        if self.run_config.automatic or self.run_config.norm_weight == "automatic":
    - 91            logger.info(f"Running initial BM25 for query adjustment")
    - 92            await self.engine.run_automatic_adjustment()
    - 93
    - 94    async def run_engine(self, *args, **kwargs):
    - 95        # Run bm25 nir adjustment
    - 96        logger.info(f"Running {self.run_config.query_type} queries")
    - 97
    - 98        return await self.engine.run_all_queries(*args, return_results=True, **kwargs)
    - 99
    -100    async def posthook(self, *args, **kwargs):
    -101        pass
    -102
    -103    async def run_pipeline(self, *args, return_results=False, **kwargs):
    -104        for cb in self.callbacks:
    -105            cb.before(self)
    -106
    -107        await self.prehook()
    -108        results = await self.run_engine(*args, **kwargs)
    -109
    -110        for cb in self.callbacks:
    -111            cb.after(results)
    -112
    -113        if return_results:
    -114            return results
    -115
    -116    def register_callback(self, cb):
    -117        self.callbacks.append(cb)
    -
    - - - - -
    - -
    NIRPipeline(*args, **kwargs)
    - -
    86    def __init__(self, *args, **kwargs):
    -87        super().__init__(*args, **kwargs)
    -
    - - - - -
    -
    - -
    async def prehook(self):
    - -
    89    async def prehook(self):
    -90        if self.run_config.automatic or self.run_config.norm_weight == "automatic":
    -91            logger.info(f"Running initial BM25 for query adjustment")
    -92            await self.engine.run_automatic_adjustment()
    -
    - - - - -
    -
    - -
    async def run_engine(self, *args, **kwargs):
    - -
    94    async def run_engine(self, *args, **kwargs):
    -95        # Run bm25 nir adjustment
    -96        logger.info(f"Running {self.run_config.query_type} queries")
    -97
    -98        return await self.engine.run_all_queries(*args, return_results=True, **kwargs)
    -
    - - - - -
    -
    - -
    async def posthook(self, *args, **kwargs):
    - -
    100    async def posthook(self, *args, **kwargs):
    -101        pass
    -
    - - - - -
    -
    - -
    async def run_pipeline(self, *args, return_results=False, **kwargs):
    - -
    103    async def run_pipeline(self, *args, return_results=False, **kwargs):
    -104        for cb in self.callbacks:
    -105            cb.before(self)
    -106
    -107        await self.prehook()
    -108        results = await self.run_engine(*args, **kwargs)
    -109
    -110        for cb in self.callbacks:
    -111            cb.after(results)
    -112
    -113        if return_results:
    -114            return results
    -
    - - - - -
    -
    - -
    def register_callback(self, cb):
    - -
    116    def register_callback(self, cb):
    -117        self.callbacks.append(cb)
    -
    - - - - -
    -
    -
    Inherited Members
    -
    - -
    -
    -
    -
\ No newline at end of file
diff --git a/docs/debeir/interfaces/query.html b/docs/debeir/interfaces/query.html
deleted file mode 100644
index 2cf4edf..0000000
--- a/docs/debeir/interfaces/query.html
+++ /dev/null
@@ -1,940 +0,0 @@
    -
    -

    -debeir.interfaces.query

    - - - - - - -
      1import dataclasses
    -  2
    -  3import loguru
    -  4
    -  5from typing import Dict, Union, Optional
    -  6
    -  7from debeir.interfaces.config import apply_config, GenericConfig
    -  8from debeir.engines.elasticsearch.generate_script_score import generate_script
    -  9from debeir.utils.scaler import get_z_value
    - 10
    - 11
    - 12@dataclasses.dataclass(init=True)
    - 13class Query:
    - 14    """
    - 15    A query interface class
    - 16    :param topics: Topics that the query will be composed of
    - 17    :param config: Config object that contains the settings for querying
    - 18    """
    - 19    topics: Dict[int, Dict[str, str]]
    - 20    config: GenericConfig
    - 21
    - 22
    - 23class GenericElasticsearchQuery(Query):
    - 24    """
    - 25    A generic elasticsearch query. Contains methods for NIR-style (embedding) queries and normal BM25 queries.
    - 26    Requires topics and a config to be included.
    - 27    """
    - 28    id_mapping: str = "Id"
    - 29
    - 30    def __init__(self, topics, config, top_bm25_scores=None, mappings=None, id_mapping=None, *args, **kwargs):
    - 31        super().__init__(topics, config)
    - 32
    - 33        self.id_mapping = "id" if id_mapping is None else id_mapping
    - 35
    - 36        if mappings is None:
    - 37            self.mappings = ["Text"]
    - 38        else:
    - 39            self.mappings = mappings
    - 40
    - 41        self.topics = topics
    - 42        self.config = config
    - 43        self.query_type = self.config.query_type
    - 44
    - 45        self.embed_mappings = ["Text_Embedding"]
    - 46
    - 47        self.query_funcs = {
    - 48            "query": self.generate_query,
    - 49            "embedding": self.generate_query_embedding,
    - 50        }
    - 51
    - 52        self.top_bm25_scores = top_bm25_scores
    - 53
    - 54    def _generate_base_query(self, topic_num):
    - 55        qfield = list(self.topics[topic_num].keys())[0]
    - 56        query = self.topics[topic_num][qfield]
    - 57        should = {"should": []}
    - 58
    - 59        for i, field in enumerate(self.mappings):
    - 60            should["should"].append(
    - 61                {
    - 62                    "match": {
    - 63                        f"{field}": {
    - 64                            "query": query,
    - 65                        }
    - 66                    }
    - 67                }
    - 68            )
    - 69
    - 70        return qfield, query, should
    - 71
    - 72    def generate_query(self, topic_num, *args, **kwargs):
    - 73        """
    - 74        Generates a simple BM25 query based on the query facets. Searches over all the document facets.
    - 75        :param topic_num:
    - 76        :param args:
    - 77        :param kwargs:
    - 78        :return:
    - 79        """
    - 80        _, _, should = self._generate_base_query(topic_num)
    - 81
    - 82        query = {
    - 83            "query": {
    - 84                "bool": should,
    - 85            }
    - 86        }
    - 87
    - 88        return query
    - 89
    - 90    def set_bm25_scores(self, scores: Dict[Union[str, int], Union[int, float]]):
    - 91        """
    - 92        Sets BM25 scores that are used for NIR-style scoring. The top BM25 score for each topic is used
    - 93        for log normalization.
    - 94
    - 95        Score = log(bm25)/log(z) + embed_score
    - 96        :param scores: Top BM25 Scores of the form {topic_num: top_bm25_score}
    - 97        """
    - 98        self.top_bm25_scores = scores
    - 99
    -100    def has_bm25_scores(self):
    -101        """
    -102        Checks if BM25 scores have been set
    -103        :return:
    -104        """
    -105        return self.top_bm25_scores is not None
    -106
    -107    @apply_config
    -108    def generate_query_embedding(
    -109        self, topic_num, encoder, *args, norm_weight=2.15, ablations=False, cosine_ceiling: Optional[float] = None,
    -110            cosine_offset: float = 1.0, **kwargs):
    -111        """
    -112        Generates an embedding script score query for Elasticsearch as part of the NIR scoring function.
    -113
    -114        :param topic_num: The topic number to search for
    -115        :param encoder: The encoder that will be used for encoding the topics
    -116        :param norm_weight: The BM25 log normalization constant
    -117        :param ablations: Whether to execute ablation style queries (i.e. one query facet
    -118                          or one document facet at a time)
    -119        :param cosine_ceiling: Cosine ceiling used for automatic z-log normalization parameter calculation
    -120        :param args:
    -121        :param kwargs: Pass disable_cache to disable encoder caching
    -122        :return:
    -123            An elasticsearch script_score query
    -124        """
    -125
    -126        qfields = list(self.topics[topic_num].keys())
    -127        should = {"should": []}
    -128
    -129        if self.has_bm25_scores():
    -130            cosine_ceiling = len(self.embed_mappings) * len(qfields) if cosine_ceiling is None else cosine_ceiling
    -131            norm_weight = get_z_value(
    -132                cosine_ceiling=cosine_ceiling,
    -133                bm25_ceiling=self.top_bm25_scores[topic_num],
    -134            )
    -135            loguru.logger.debug(f"Automatic norm_weight: {norm_weight}")
    -136
    -137        params = {
    -138            "weights": [1] * (len(self.embed_mappings) * len(self.mappings)),
    -139            "offset": cosine_offset,
    -140            "norm_weight": norm_weight,
    -141            "disable_bm25": ablations,
    -142        }
    -143
    -144        embed_fields = []
    -145
    -146        for qfield in qfields:
    -147            for field in self.mappings:
    -148                should["should"].append(
    -149                    {
    -150                        "match": {
    -151                            f"{field}": {
    -152                                "query": self.topics[topic_num][qfield],
    -153                            }
    -154                        }
    -155                    }
    -156                )
    -157
    -158            params[f"{qfield}_eb"] = encoder.encode(
    -159                encoder, topic=self.topics[topic_num][qfield]
    -160            )
    -161            embed_fields.append(f"{qfield}_eb")
    -162
    -163        query = {
    -164            "query": {
    -165                "script_score": {
    -166                    "query": {
    -167                        "bool": should,
    -168                    },
    -169                    "script": generate_script(
    -170                        self.embed_mappings, params, qfields=embed_fields
    -171                    ),
    -172                }
    -173            }
    -174        }
    -175
    -176        loguru.logger.debug(query)
    -177        return query
    -178
    -179    @classmethod
    -180    def get_id_mapping(cls, hit):
    -181        """
    -182        Get the document ID
    -183
    -184        :param hit: The raw document result
    -185        :return:
    -186            The document's ID
    -187        """
    -188        return hit[cls.id_mapping]
    -
    - - -
    -
    - -
    -
    @dataclasses.dataclass(init=True)
    class Query:
    - -
    13@dataclasses.dataclass(init=True)
    -14class Query:
    -15    """
    -16    A query interface class
    -17    :param topics: Topics that the query will be composed of
    -18    :param config: Config object that contains the settings for querying
    -19    """
    -20    topics: Dict[int, Dict[str, str]]
    -21    config: GenericConfig
    -
    - - -

    A query interface class

    Parameters

    • topics: Topics that the query will be composed of
    • config: Config object that contains the settings for querying
    - - -
    -
    Query(topics: Dict[int, Dict[str, str]], config: debeir.interfaces.config.GenericConfig)
    - - - - -
    -
    -
    - -
    class GenericElasticsearchQuery(Query):
    - -
     24class GenericElasticsearchQuery(Query):
    - 25    """
    - 26    A generic elasticsearch query. Contains methods for NIR-style (embedding) queries and normal BM25 queries.
    - 27    Requires topics and a config to be included.
    - 28    """
    - 29    id_mapping: str = "Id"
    - 30
    - 31    def __init__(self, topics, config, top_bm25_scores=None, mappings=None, id_mapping=None, *args, **kwargs):
    - 32        super().__init__(topics, config)
    - 33
    - 34        self.id_mapping = "id" if id_mapping is None else id_mapping
    - 36
    - 37        if mappings is None:
    - 38            self.mappings = ["Text"]
    - 39        else:
    - 40            self.mappings = mappings
    - 41
    - 42        self.topics = topics
    - 43        self.config = config
    - 44        self.query_type = self.config.query_type
    - 45
    - 46        self.embed_mappings = ["Text_Embedding"]
    - 47
    - 48        self.query_funcs = {
    - 49            "query": self.generate_query,
    - 50            "embedding": self.generate_query_embedding,
    - 51        }
    - 52
    - 53        self.top_bm25_scores = top_bm25_scores
    - 54
    - 55    def _generate_base_query(self, topic_num):
    - 56        qfield = list(self.topics[topic_num].keys())[0]
    - 57        query = self.topics[topic_num][qfield]
    - 58        should = {"should": []}
    - 59
    - 60        for i, field in enumerate(self.mappings):
    - 61            should["should"].append(
    - 62                {
    - 63                    "match": {
    - 64                        f"{field}": {
    - 65                            "query": query,
    - 66                        }
    - 67                    }
    - 68                }
    - 69            )
    - 70
    - 71        return qfield, query, should
    - 72
    - 73    def generate_query(self, topic_num, *args, **kwargs):
    - 74        """
    - 75        Generates a simple BM25 query based on the query facets. Searches over all the document facets.
    - 76        :param topic_num:
    - 77        :param args:
    - 78        :param kwargs:
    - 79        :return:
    - 80        """
    - 81        _, _, should = self._generate_base_query(topic_num)
    - 82
    - 83        query = {
    - 84            "query": {
    - 85                "bool": should,
    - 86            }
    - 87        }
    - 88
    - 89        return query
    - 90
    - 91    def set_bm25_scores(self, scores: Dict[Union[str, int], Union[int, float]]):
    - 92        """
    - 93        Sets BM25 scores that are used for NIR-style scoring. The top BM25 score for each topic is used
    - 94        for log normalization.
    - 95
    - 96        Score = log(bm25)/log(z) + embed_score
    - 97        :param scores: Top BM25 Scores of the form {topic_num: top_bm25_score}
    - 98        """
    - 99        self.top_bm25_scores = scores
    -100
    -101    def has_bm25_scores(self):
    -102        """
    -103        Checks if BM25 scores have been set
    -104        :return:
    -105        """
    -106        return self.top_bm25_scores is not None
    -107
    -108    @apply_config
    -109    def generate_query_embedding(
    -110        self, topic_num, encoder, *args, norm_weight=2.15, ablations=False, cosine_ceiling: Optional[float] = None,
    -111            cosine_offset: float = 1.0, **kwargs):
    -112        """
    -113        Generates an embedding script score query for Elasticsearch as part of the NIR scoring function.
    -114
    -115        :param topic_num: The topic number to search for
    -116        :param encoder: The encoder that will be used for encoding the topics
    -117        :param norm_weight: The BM25 log normalization constant
    -118        :param ablations: Whether to execute ablation style queries (i.e. one query facet
    -119                          or one document facet at a time)
    -120        :param cosine_ceiling: Cosine ceiling used for automatic z-log normalization parameter calculation
    -121        :param args:
    -122        :param kwargs: Pass disable_cache to disable encoder caching
    -123        :return:
    -124            An elasticsearch script_score query
    -125        """
    -126
    -127        qfields = list(self.topics[topic_num].keys())
    -128        should = {"should": []}
    -129
    -130        if self.has_bm25_scores():
    -131            cosine_ceiling = len(self.embed_mappings) * len(qfields) if cosine_ceiling is None else cosine_ceiling
    -132            norm_weight = get_z_value(
    -133                cosine_ceiling=cosine_ceiling,
    -134                bm25_ceiling=self.top_bm25_scores[topic_num],
    -135            )
    -136            loguru.logger.debug(f"Automatic norm_weight: {norm_weight}")
    -137
    -138        params = {
    -139            "weights": [1] * (len(self.embed_mappings) * len(self.mappings)),
    -140            "offset": cosine_offset,
    -141            "norm_weight": norm_weight,
    -142            "disable_bm25": ablations,
    -143        }
    -144
    -145        embed_fields = []
    -146
    -147        for qfield in qfields:
    -148            for field in self.mappings:
    -149                should["should"].append(
    -150                    {
    -151                        "match": {
    -152                            f"{field}": {
    -153                                "query": self.topics[topic_num][qfield],
    -154                            }
    -155                        }
    -156                    }
    -157                )
    -158
    -159            params[f"{qfield}_eb"] = encoder.encode(
    -160                encoder, topic=self.topics[topic_num][qfield]
    -161            )
    -162            embed_fields.append(f"{qfield}_eb")
    -163
    -164        query = {
    -165            "query": {
    -166                "script_score": {
    -167                    "query": {
    -168                        "bool": should,
    -169                    },
    -170                    "script": generate_script(
    -171                        self.embed_mappings, params, qfields=embed_fields
    -172                    ),
    -173                }
    -174            }
    -175        }
    -176
    -177        loguru.logger.debug(query)
    -178        return query
    -179
    -180    @classmethod
    -181    def get_id_mapping(cls, hit):
    -182        """
    -183        Get the document ID
    -184
    -185        :param hit: The raw document result
    -186        :return:
    -187            The document's ID
    -188        """
    -189        return hit[cls.id_mapping]
    -
    - - -

    A generic elasticsearch query. Contains methods for NIR-style (embedding) queries and normal BM25 queries. Requires topics and a config to be included.

    -
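    A hedged construction sketch (topics and config are assumed to come from a parser and a run config respectively; the mappings are illustrative index fields):

        query_builder = GenericElasticsearchQuery(
            topics=topics,
            config=config,
            mappings=["Title", "Abstract"],
        )
        body = query_builder.generate_query(topic_num=1)
        # body == {"query": {"bool": {"should": [...one match clause per mapping...]}}}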
    - - -
    - -
    GenericElasticsearchQuery(topics, config, top_bm25_scores=None, mappings=None, id_mapping=None, *args, **kwargs)
    - -
    31    def __init__(self, topics, config, top_bm25_scores=None, mappings=None, id_mapping=None, *args, **kwargs):
    -32        super().__init__(topics, config)
    -33
    -34        self.id_mapping = "id" if id_mapping is None else id_mapping
    -36
    -37        if mappings is None:
    -38            self.mappings = ["Text"]
    -39        else:
    -40            self.mappings = mappings
    -41
    -42        self.topics = topics
    -43        self.config = config
    -44        self.query_type = self.config.query_type
    -45
    -46        self.embed_mappings = ["Text_Embedding"]
    -47
    -48        self.query_funcs = {
    -49            "query": self.generate_query,
    -50            "embedding": self.generate_query_embedding,
    -51        }
    -52
    -53        self.top_bm25_scores = top_bm25_scores
    -
    - - - - -
    -
    - -
    def generate_query(self, topic_num, *args, **kwargs):
    - -
    73    def generate_query(self, topic_num, *args, **kwargs):
    -74        """
    -75        Generates a simple BM25 query based on the query facets. Searches over all the document facets.
    -76        :param topic_num:
    -77        :param args:
    -78        :param kwargs:
    -79        :return:
    -80        """
    -81        _, _, should = self._generate_base_query(topic_num)
    -82
    -83        query = {
    -84            "query": {
    -85                "bool": should,
    -86            }
    -87        }
    -88
    -89        return query
    -
    - - -

    Generates a simple BM25 query based on the query facets. Searches over all the document facets.

    Parameters

    • topic_num:
    • args:
    • kwargs:

    Returns
    - - -
    -
    - -
    def set_bm25_scores(self, scores: Dict[Union[str, int], Union[int, float]]):
    - -
    91    def set_bm25_scores(self, scores: Dict[Union[str, int], Union[int, float]]):
    -92        """
    -93        Sets BM25 scores that are used for NIR-style scoring. The top BM25 score for each topic is used
    -94        for log normalization.
    -95
    -96        Score = log(bm25)/log(z) + embed_score
    -97        :param scores: Top BM25 Scores of the form {topic_num: top_bm25_score}
    -98        """
    -99        self.top_bm25_scores = scores
    -
    - - -

    Sets BM25 scores that are used for NIR-style scoring. The top BM25 score for each topic is used for log normalization.

    - -

    Score = log(bm25)/log(z) + embed_score

    Parameters

    • scores: Top BM25 Scores of the form {topic_num: top_bm25_score}
    -
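    Reading the formula above, the normalization constant z is chosen so that the top BM25 score maps onto the cosine ceiling, i.e. log(bm25_ceiling)/log(z) = cosine_ceiling, which rearranges to z = bm25_ceiling ** (1 / cosine_ceiling). A worked sketch of that reading (my derivation, not a quote of get_z_value):

        import math

        bm25_ceiling = 18.5   # hypothetical top BM25 score for one topic
        cosine_ceiling = 2.0  # e.g. number of embedding fields * number of query fields
        z = bm25_ceiling ** (1 / cosine_ceiling)
        assert math.isclose(math.log(bm25_ceiling) / math.log(z), cosine_ceiling)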
    - - -
    -
    - -
    def has_bm25_scores(self):
    - -
    101    def has_bm25_scores(self):
    -102        """
    -103        Checks if BM25 scores have been set
    -104        :return:
    -105        """
    -106        return self.top_bm25_scores is not None
    -
    - - -

    Checks if BM25 scores have been set

    - -
    Returns
    -
    - - -
    -
    - -
    def generate_query_embedding(self, *args, **kwargs):
    - -
    231    def use_config(self, *args, **kwargs):
    -232        """
    -233        Replaces keywords and args passed to the function with ones from self.config.
    -234
    -235        :param self:
    -236        :param args: To be updated
    -237        :param kwargs: To be updated
    -238        :return:
    -239        """
    -240        if self.config is not None:
    -241            kwargs = self.config.__update__(**kwargs)
    -242
    -243        return func(self, *args, **kwargs)
    -
    - - -

    Generates an embedding script score query for Elasticsearch as part of the NIR scoring function.

    Parameters

    • topic_num: The topic number to search for
    • encoder: The encoder that will be used for encoding the topics
    • norm_weight: The BM25 log normalization constant
    • ablations: Whether to execute ablation style queries (i.e. one query facet or one document facet at a time)
    • cosine_ceiling: Cosine ceiling used for automatic z-log normalization parameter calculation
    • args:
    • kwargs: Pass disable_cache to disable encoder caching

    Returns

    An elasticsearch script_score query
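    A hedged call sketch (query_builder and the encoder object are assumed from the surrounding examples and the wider library; most parameters can also be injected from the run config via @apply_config):

        emb_body = query_builder.generate_query_embedding(1, encoder, norm_weight=2.15)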
    - - -
    -
    - -
    -
    @classmethod
    def get_id_mapping(cls, hit):
    - -
    180    @classmethod
    -181    def get_id_mapping(cls, hit):
    -182        """
    -183        Get the document ID
    -184
    -185        :param hit: The raw document result
    -186        :return:
    -187            The document's ID
    -188        """
    -189        return hit[cls.id_mapping]
    -
    - - -

    Get the document ID

    Parameters

    • hit: The raw document result

    Returns

    The document's ID
    -
    -
    - - -
    -
    -
    - - \ No newline at end of file diff --git a/docs/search.js b/docs/search.js index 5d635ec..3526cc8 100644 --- a/docs/search.js +++ b/docs/search.js @@ -1,6 +1,6 @@ window.pdocSearch = (function(){ /** elasticlunr - http://weixsong.github.io * Copyright (C) 2017 Oliver Nightingale * Copyright (C) 2017 Wei Song * MIT Licensed */!function(){function e(e){if(null===e||"object"!=typeof e)return e;var t=e.constructor();for(var n in e)e.hasOwnProperty(n)&&(t[n]=e[n]);return t}var t=function(e){var n=new t.Index;return n.pipeline.add(t.trimmer,t.stopWordFilter,t.stemmer),e&&e.call(n,n),n};t.version="0.9.5",lunr=t,t.utils={},t.utils.warn=function(e){return function(t){e.console&&console.warn&&console.warn(t)}}(this),t.utils.toString=function(e){return void 0===e||null===e?"":e.toString()},t.EventEmitter=function(){this.events={}},t.EventEmitter.prototype.addListener=function(){var e=Array.prototype.slice.call(arguments),t=e.pop(),n=e;if("function"!=typeof t)throw new TypeError("last argument must be a function");n.forEach(function(e){this.hasHandler(e)||(this.events[e]=[]),this.events[e].push(t)},this)},t.EventEmitter.prototype.removeListener=function(e,t){if(this.hasHandler(e)){var n=this.events[e].indexOf(t);-1!==n&&(this.events[e].splice(n,1),0==this.events[e].length&&delete this.events[e])}},t.EventEmitter.prototype.emit=function(e){if(this.hasHandler(e)){var t=Array.prototype.slice.call(arguments,1);this.events[e].forEach(function(e){e.apply(void 0,t)},this)}},t.EventEmitter.prototype.hasHandler=function(e){return e in this.events},t.tokenizer=function(e){if(!arguments.length||null===e||void 0===e)return[];if(Array.isArray(e)){var n=e.filter(function(e){return null===e||void 0===e?!1:!0});n=n.map(function(e){return t.utils.toString(e).toLowerCase()});var i=[];return n.forEach(function(e){var n=e.split(t.tokenizer.seperator);i=i.concat(n)},this),i}return e.toString().trim().toLowerCase().split(t.tokenizer.seperator)},t.tokenizer.defaultSeperator=/[\s\-]+/,t.tokenizer.seperator=t.tokenizer.defaultSeperator,t.tokenizer.setSeperator=function(e){null!==e&&void 0!==e&&"object"==typeof e&&(t.tokenizer.seperator=e)},t.tokenizer.resetSeperator=function(){t.tokenizer.seperator=t.tokenizer.defaultSeperator},t.tokenizer.getSeperator=function(){return t.tokenizer.seperator},t.Pipeline=function(){this._queue=[]},t.Pipeline.registeredFunctions={},t.Pipeline.registerFunction=function(e,n){n in t.Pipeline.registeredFunctions&&t.utils.warn("Overwriting existing registered function: "+n),e.label=n,t.Pipeline.registeredFunctions[n]=e},t.Pipeline.getRegisteredFunction=function(e){return e in t.Pipeline.registeredFunctions!=!0?null:t.Pipeline.registeredFunctions[e]},t.Pipeline.warnIfFunctionNotRegistered=function(e){var n=e.label&&e.label in this.registeredFunctions;n||t.utils.warn("Function is not registered with pipeline. 
This may cause problems when serialising the index.\n",e)},t.Pipeline.load=function(e){var n=new t.Pipeline;return e.forEach(function(e){var i=t.Pipeline.getRegisteredFunction(e);if(!i)throw new Error("Cannot load un-registered function: "+e);n.add(i)}),n},t.Pipeline.prototype.add=function(){var e=Array.prototype.slice.call(arguments);e.forEach(function(e){t.Pipeline.warnIfFunctionNotRegistered(e),this._queue.push(e)},this)},t.Pipeline.prototype.after=function(e,n){t.Pipeline.warnIfFunctionNotRegistered(n);var i=this._queue.indexOf(e);if(-1===i)throw new Error("Cannot find existingFn");this._queue.splice(i+1,0,n)},t.Pipeline.prototype.before=function(e,n){t.Pipeline.warnIfFunctionNotRegistered(n);var i=this._queue.indexOf(e);if(-1===i)throw new Error("Cannot find existingFn");this._queue.splice(i,0,n)},t.Pipeline.prototype.remove=function(e){var t=this._queue.indexOf(e);-1!==t&&this._queue.splice(t,1)},t.Pipeline.prototype.run=function(e){for(var t=[],n=e.length,i=this._queue.length,o=0;n>o;o++){for(var r=e[o],s=0;i>s&&(r=this._queue[s](r,o,e),void 0!==r&&null!==r);s++);void 0!==r&&null!==r&&t.push(r)}return t},t.Pipeline.prototype.reset=function(){this._queue=[]},t.Pipeline.prototype.get=function(){return this._queue},t.Pipeline.prototype.toJSON=function(){return this._queue.map(function(e){return t.Pipeline.warnIfFunctionNotRegistered(e),e.label})},t.Index=function(){this._fields=[],this._ref="id",this.pipeline=new t.Pipeline,this.documentStore=new t.DocumentStore,this.index={},this.eventEmitter=new t.EventEmitter,this._idfCache={},this.on("add","remove","update",function(){this._idfCache={}}.bind(this))},t.Index.prototype.on=function(){var e=Array.prototype.slice.call(arguments);return this.eventEmitter.addListener.apply(this.eventEmitter,e)},t.Index.prototype.off=function(e,t){return this.eventEmitter.removeListener(e,t)},t.Index.load=function(e){e.version!==t.version&&t.utils.warn("version mismatch: current "+t.version+" importing "+e.version);var n=new this;n._fields=e.fields,n._ref=e.ref,n.documentStore=t.DocumentStore.load(e.documentStore),n.pipeline=t.Pipeline.load(e.pipeline),n.index={};for(var i in e.index)n.index[i]=t.InvertedIndex.load(e.index[i]);return n},t.Index.prototype.addField=function(e){return this._fields.push(e),this.index[e]=new t.InvertedIndex,this},t.Index.prototype.setRef=function(e){return this._ref=e,this},t.Index.prototype.saveDocument=function(e){return this.documentStore=new t.DocumentStore(e),this},t.Index.prototype.addDoc=function(e,n){if(e){var n=void 0===n?!0:n,i=e[this._ref];this.documentStore.addDoc(i,e),this._fields.forEach(function(n){var o=this.pipeline.run(t.tokenizer(e[n]));this.documentStore.addFieldLength(i,n,o.length);var r={};o.forEach(function(e){e in r?r[e]+=1:r[e]=1},this);for(var s in r){var u=r[s];u=Math.sqrt(u),this.index[n].addToken(s,{ref:i,tf:u})}},this),n&&this.eventEmitter.emit("add",e,this)}},t.Index.prototype.removeDocByRef=function(e){if(e&&this.documentStore.isDocStored()!==!1&&this.documentStore.hasDoc(e)){var t=this.documentStore.getDoc(e);this.removeDoc(t,!1)}},t.Index.prototype.removeDoc=function(e,n){if(e){var n=void 0===n?!0:n,i=e[this._ref];this.documentStore.hasDoc(i)&&(this.documentStore.removeDoc(i),this._fields.forEach(function(n){var o=this.pipeline.run(t.tokenizer(e[n]));o.forEach(function(e){this.index[n].removeToken(e,i)},this)},this),n&&this.eventEmitter.emit("remove",e,this))}},t.Index.prototype.updateDoc=function(e,t){var t=void 
0===t?!0:t;this.removeDocByRef(e[this._ref],!1),this.addDoc(e,!1),t&&this.eventEmitter.emit("update",e,this)},t.Index.prototype.idf=function(e,t){var n="@"+t+"/"+e;if(Object.prototype.hasOwnProperty.call(this._idfCache,n))return this._idfCache[n];var i=this.index[t].getDocFreq(e),o=1+Math.log(this.documentStore.length/(i+1));return this._idfCache[n]=o,o},t.Index.prototype.getFields=function(){return this._fields.slice()},t.Index.prototype.search=function(e,n){if(!e)return[];e="string"==typeof e?{any:e}:JSON.parse(JSON.stringify(e));var i=null;null!=n&&(i=JSON.stringify(n));for(var o=new t.Configuration(i,this.getFields()).get(),r={},s=Object.keys(e),u=0;u0&&t.push(e);for(var i in n)"docs"!==i&&"df"!==i&&this.expandToken(e+i,t,n[i]);return t},t.InvertedIndex.prototype.toJSON=function(){return{root:this.root}},t.Configuration=function(e,n){var e=e||"";if(void 0==n||null==n)throw new Error("fields should not be null");this.config={};var i;try{i=JSON.parse(e),this.buildUserConfig(i,n)}catch(o){t.utils.warn("user configuration parse failed, will use default configuration"),this.buildDefaultConfig(n)}},t.Configuration.prototype.buildDefaultConfig=function(e){this.reset(),e.forEach(function(e){this.config[e]={boost:1,bool:"OR",expand:!1}},this)},t.Configuration.prototype.buildUserConfig=function(e,n){var i="OR",o=!1;if(this.reset(),"bool"in e&&(i=e.bool||i),"expand"in e&&(o=e.expand||o),"fields"in e)for(var r in e.fields)if(n.indexOf(r)>-1){var s=e.fields[r],u=o;void 0!=s.expand&&(u=s.expand),this.config[r]={boost:s.boost||0===s.boost?s.boost:1,bool:s.bool||i,expand:u}}else t.utils.warn("field name in user configuration not found in index instance fields");else this.addAllFields2UserConfig(i,o,n)},t.Configuration.prototype.addAllFields2UserConfig=function(e,t,n){n.forEach(function(n){this.config[n]={boost:1,bool:e,expand:t}},this)},t.Configuration.prototype.get=function(){return this.config},t.Configuration.prototype.reset=function(){this.config={}},lunr.SortedSet=function(){this.length=0,this.elements=[]},lunr.SortedSet.load=function(e){var t=new this;return t.elements=e,t.length=e.length,t},lunr.SortedSet.prototype.add=function(){var e,t;for(e=0;e1;){if(r===e)return o;e>r&&(t=o),r>e&&(n=o),i=n-t,o=t+Math.floor(i/2),r=this.elements[o]}return r===e?o:-1},lunr.SortedSet.prototype.locationFor=function(e){for(var t=0,n=this.elements.length,i=n-t,o=t+Math.floor(i/2),r=this.elements[o];i>1;)e>r&&(t=o),r>e&&(n=o),i=n-t,o=t+Math.floor(i/2),r=this.elements[o];return r>e?o:e>r?o+1:void 0},lunr.SortedSet.prototype.intersect=function(e){for(var t=new lunr.SortedSet,n=0,i=0,o=this.length,r=e.length,s=this.elements,u=e.elements;;){if(n>o-1||i>r-1)break;s[n]!==u[i]?s[n]u[i]&&i++:(t.add(s[n]),n++,i++)}return t},lunr.SortedSet.prototype.clone=function(){var e=new lunr.SortedSet;return e.elements=this.toArray(),e.length=e.elements.length,e},lunr.SortedSet.prototype.union=function(e){var t,n,i;this.length>=e.length?(t=this,n=e):(t=e,n=this),i=t.clone();for(var o=0,r=n.toArray();oThe DeBEIR (Dense Bi-Encoder Information Retrieval) source code library.

    \n\n

See ./examples/ in the parent directory for out-of-the-box runnable code.

    \n\n

    Otherwise, check out notebooks in the parent directory for training your own model amongst other things.

    \n"}, {"fullname": "debeir.core", "modulename": "debeir.core", "kind": "module", "doc": "

    Core library interfaces that must be implemented for custom datasets

    \n\n

Interfaces to implement custom datasets in debeir.datasets.

    \n"}, {"fullname": "debeir.core.callbacks", "modulename": "debeir.core.callbacks", "kind": "module", "doc": "

Callbacks for before/after running.\nE.g. before is for setup;\nafter is for evaluation/serialization, etc.

    \n"}, {"fullname": "debeir.core.callbacks.Callback", "modulename": "debeir.core.callbacks", "qualname": "Callback", "kind": "class", "doc": "

    \n"}, {"fullname": "debeir.core.callbacks.Callback.__init__", "modulename": "debeir.core.callbacks", "qualname": "Callback.__init__", "kind": "function", "doc": "

    \n", "signature": "()"}, {"fullname": "debeir.core.callbacks.Callback.before", "modulename": "debeir.core.callbacks", "qualname": "Callback.before", "kind": "function", "doc": "

    \n", "signature": "(self, pipeline: debeir.core.pipeline.Pipeline):", "funcdef": "def"}, {"fullname": "debeir.core.callbacks.Callback.after", "modulename": "debeir.core.callbacks", "qualname": "Callback.after", "kind": "function", "doc": "

    \n", "signature": "(self, results: List):", "funcdef": "def"}, {"fullname": "debeir.core.callbacks.SerializationCallback", "modulename": "debeir.core.callbacks", "qualname": "SerializationCallback", "kind": "class", "doc": "

    \n", "bases": "Callback"}, {"fullname": "debeir.core.callbacks.SerializationCallback.__init__", "modulename": "debeir.core.callbacks", "qualname": "SerializationCallback.__init__", "kind": "function", "doc": "

    \n", "signature": "(\tconfig: debeir.core.config.GenericConfig,\tnir_config: debeir.core.config.NIRConfig)"}, {"fullname": "debeir.core.callbacks.SerializationCallback.before", "modulename": "debeir.core.callbacks", "qualname": "SerializationCallback.before", "kind": "function", "doc": "

    Check if output file exists

    \n\n
    Returns
    \n\n
    \n
    Output file path\n
    \n
    \n", "signature": "(self, pipeline: debeir.core.pipeline.Pipeline):", "funcdef": "def"}, {"fullname": "debeir.core.callbacks.SerializationCallback.after", "modulename": "debeir.core.callbacks", "qualname": "SerializationCallback.after", "kind": "function", "doc": "

    Serialize results to self.output_file in a TREC-style format

    \n\n
    Parameters
    \n\n
      \n
    • topic_num: Topic number to serialize
    • \n
    • res: Raw elasticsearch result
    • \n
    • run_name: The run name for TREC-style runs (default: NO_RUN_NAME)
    • \n
    \n", "signature": "(self, results: List):", "funcdef": "def"}, {"fullname": "debeir.core.callbacks.EvaluationCallback", "modulename": "debeir.core.callbacks", "qualname": "EvaluationCallback", "kind": "class", "doc": "

    \n", "bases": "Callback"}, {"fullname": "debeir.core.callbacks.EvaluationCallback.__init__", "modulename": "debeir.core.callbacks", "qualname": "EvaluationCallback.__init__", "kind": "function", "doc": "

    \n", "signature": "(evaluator: debeir.evaluation.evaluator.Evaluator, config)"}, {"fullname": "debeir.core.callbacks.EvaluationCallback.before", "modulename": "debeir.core.callbacks", "qualname": "EvaluationCallback.before", "kind": "function", "doc": "

    \n", "signature": "(self, pipeline: debeir.core.pipeline.Pipeline):", "funcdef": "def"}, {"fullname": "debeir.core.callbacks.EvaluationCallback.after", "modulename": "debeir.core.callbacks", "qualname": "EvaluationCallback.after", "kind": "function", "doc": "

    \n", "signature": "(self, results: List, id_field='id'):", "funcdef": "def"}, {"fullname": "debeir.core.config", "modulename": "debeir.core.config", "kind": "module", "doc": "

    \n"}, {"fullname": "debeir.core.config.Config", "modulename": "debeir.core.config", "qualname": "Config", "kind": "class", "doc": "

    Config Interface with creation class methods

    \n"}, {"fullname": "debeir.core.config.Config.__init__", "modulename": "debeir.core.config", "qualname": "Config.__init__", "kind": "function", "doc": "

    \n", "signature": "()"}, {"fullname": "debeir.core.config.Config.from_toml", "modulename": "debeir.core.config", "qualname": "Config.from_toml", "kind": "function", "doc": "

    Instantiates a Config object from a toml file

    \n\n
    Parameters
    \n\n
      \n
    • fp: File path of the Config TOML file
    • \n
    • field_class: Class of the Config object to be instantiated
    • \n
    • args: Arguments to be passed to Config
    • \n
    • kwargs: Keyword arguments to be passed
    • \n
    \n\n
    Returns
    \n\n
    \n
An instantiated and validated Config object.\n
    \n
    \n", "signature": "(\tcls,\tfp: Union[str, pathlib.Path],\tfield_class,\t*args,\t**kwargs) -> debeir.core.config.Config:", "funcdef": "def"}, {"fullname": "debeir.core.config.Config.from_args", "modulename": "debeir.core.config", "qualname": "Config.from_args", "kind": "function", "doc": "

    Instantiates a Config object from arguments

    \n\n
    Parameters
    \n\n
      \n
    • args_dict:
    • \n
    • field_class:
    • \n
    • args:
    • \n
    • kwargs:
    • \n
    \n\n
    Returns
    \n", "signature": "(cls, args_dict: MutableMapping, field_class, *args, **kwargs):", "funcdef": "def"}, {"fullname": "debeir.core.config.Config.from_dict", "modulename": "debeir.core.config", "qualname": "Config.from_dict", "kind": "function", "doc": "

    Instantiates a Config object from a dictionary

    \n\n
    Parameters
    \n\n
      \n
    • data_class:
    • \n
    • kwargs:
    • \n
    \n\n
    Returns
    \n", "signature": "(cls, data_class, **kwargs):", "funcdef": "def"}, {"fullname": "debeir.core.config.Config.validate", "modulename": "debeir.core.config", "qualname": "Config.validate", "kind": "function", "doc": "

    Validates if the config is correct.\nMust be implemented by inherited classes.

    \n", "signature": "(self):", "funcdef": "def"}, {"fullname": "debeir.core.config.GenericConfig", "modulename": "debeir.core.config", "qualname": "GenericConfig", "kind": "class", "doc": "

Generic NIR Configuration file from which all configs will inherit

    \n", "bases": "Config, abc.ABC"}, {"fullname": "debeir.core.config.GenericConfig.__init__", "modulename": "debeir.core.config", "qualname": "GenericConfig.__init__", "kind": "function", "doc": "

    \n", "signature": "(\tquery_type: str,\tindex: str = None,\tencoder_normalize: bool = True,\tablations: bool = False,\tnorm_weight: float = None,\tautomatic: bool = None,\tencoder: object = None,\tencoder_fp: str = None,\tquery_weights: List[float] = None,\tcosine_weights: List[float] = None,\tevaluate: bool = False,\tqrels: str = None,\tconfig_fn: str = None,\tquery_fn: str = None,\tparser_fn: str = None,\texecutor_fn: str = None,\tcosine_ceiling: float = None,\ttopics_path: str = None,\treturn_id_only: bool = False,\toverwrite_output_if_exists: bool = False,\toutput_file: str = None,\trun_name: str = None)"}, {"fullname": "debeir.core.config.GenericConfig.from_toml", "modulename": "debeir.core.config", "qualname": "GenericConfig.from_toml", "kind": "function", "doc": "

    Instantiates a Config object from a toml file

    \n\n
    Parameters
    \n\n
      \n
    • fp: File path of the Config TOML file
    • \n
    • field_class: Class of the Config object to be instantiated
    • \n
    • args: Arguments to be passed to Config
    • \n
    • kwargs: Keyword arguments to be passed
    • \n
    \n\n
    Returns
    \n\n
    \n
An instantiated and validated Config object.\n
    \n
    \n", "signature": "(\tcls,\tfp: Union[str, pathlib.Path],\t*args,\t**kwargs) -> debeir.core.config.GenericConfig:", "funcdef": "def"}, {"fullname": "debeir.core.config.ElasticsearchConfig", "modulename": "debeir.core.config", "qualname": "ElasticsearchConfig", "kind": "class", "doc": "

    Basic Elasticsearch configuration file settings from the master nir.toml file

    \n", "bases": "Config"}, {"fullname": "debeir.core.config.ElasticsearchConfig.__init__", "modulename": "debeir.core.config", "qualname": "ElasticsearchConfig.__init__", "kind": "function", "doc": "

    \n", "signature": "(protocol: str, ip: str, port: str, timeout: int)"}, {"fullname": "debeir.core.config.ElasticsearchConfig.validate", "modulename": "debeir.core.config", "qualname": "ElasticsearchConfig.validate", "kind": "function", "doc": "

    Checks if Elasticsearch URL is correct

    \n", "signature": "(self):", "funcdef": "def"}, {"fullname": "debeir.core.config.SolrConfig", "modulename": "debeir.core.config", "qualname": "SolrConfig", "kind": "class", "doc": "

    Basic Solr configuration file settings from the master nir.toml file

    \n", "bases": "ElasticsearchConfig"}, {"fullname": "debeir.core.config.SolrConfig.__init__", "modulename": "debeir.core.config", "qualname": "SolrConfig.__init__", "kind": "function", "doc": "

    \n", "signature": "(protocol: str, ip: str, port: str, timeout: int)"}, {"fullname": "debeir.core.config.MetricsConfig", "modulename": "debeir.core.config", "qualname": "MetricsConfig", "kind": "class", "doc": "

    Basic Metrics configuration file settings from the master nir.toml file

    \n", "bases": "Config"}, {"fullname": "debeir.core.config.MetricsConfig.__init__", "modulename": "debeir.core.config", "qualname": "MetricsConfig.__init__", "kind": "function", "doc": "

    \n", "signature": "(metrics: List[str])"}, {"fullname": "debeir.core.config.MetricsConfig.validate", "modulename": "debeir.core.config", "qualname": "MetricsConfig.validate", "kind": "function", "doc": "

Checks if each metric is usable by evaluator classes

    \n", "signature": "(self):", "funcdef": "def"}, {"fullname": "debeir.core.config.NIRConfig", "modulename": "debeir.core.config", "qualname": "NIRConfig", "kind": "class", "doc": "

    Basic NIR configuration file settings from the master nir.toml file

    \n", "bases": "Config"}, {"fullname": "debeir.core.config.NIRConfig.__init__", "modulename": "debeir.core.config", "qualname": "NIRConfig.__init__", "kind": "function", "doc": "

    \n", "signature": "(\tnorm_weight: str,\tevaluate: bool,\treturn_size: int,\toutput_directory: str)"}, {"fullname": "debeir.core.config.NIRConfig.validate", "modulename": "debeir.core.config", "qualname": "NIRConfig.validate", "kind": "function", "doc": "

    Validates if the config is correct.\nMust be implemented by inherited classes.

    \n", "signature": "(self):", "funcdef": "def"}, {"fullname": "debeir.core.config.apply_config", "modulename": "debeir.core.config", "qualname": "apply_config", "kind": "function", "doc": "

    Configuration decorator.

    \n\n
    Parameters
    \n\n
      \n
    • func: Decorated function
    • \n
    \n\n
    Returns
    \n", "signature": "(func):", "funcdef": "def"}, {"fullname": "debeir.core.config.override_with_toml_config", "modulename": "debeir.core.config", "qualname": "override_with_toml_config", "kind": "function", "doc": "

Configuration decorator. Overwrites a function's args and kwargs with those from a specified toml config file.\nPass override_with_config=path/to/config

    \n\n
    Parameters
    \n\n
      \n
    • func: Decorated function
    • \n
    \n\n
    Returns
    \n", "signature": "(func):", "funcdef": "def"}, {"fullname": "debeir.core.config.save_kwargs_to_file", "modulename": "debeir.core.config", "qualname": "save_kwargs_to_file", "kind": "function", "doc": "

    \n", "signature": "(func):", "funcdef": "def"}, {"fullname": "debeir.core.converters", "modulename": "debeir.core.converters", "kind": "module", "doc": "

    \n"}, {"fullname": "debeir.core.converters.ParsedTopicsToDataset", "modulename": "debeir.core.converters", "qualname": "ParsedTopicsToDataset", "kind": "class", "doc": "

    Converts a parser's output to a huggingface dataset object.

    \n"}, {"fullname": "debeir.core.converters.ParsedTopicsToDataset.__init__", "modulename": "debeir.core.converters", "qualname": "ParsedTopicsToDataset.__init__", "kind": "function", "doc": "

    \n", "signature": "()"}, {"fullname": "debeir.core.converters.ParsedTopicsToDataset.convert", "modulename": "debeir.core.converters", "qualname": "ParsedTopicsToDataset.convert", "kind": "function", "doc": "

    Flatten a Dict of shape (traditional parser output)\n{topic_id: {\n \"Facet_1\": ...\n \"Facet_2\": ...\n }\n}

    \n\n

    ->

    \n\n

    To a flattened arrow-like dataset.\n{\ntopic_ids: [],\nFacet_1s: [],\nFacet_2s: [],\n}

    \n\n
    Parameters
    \n\n
      \n
    • output: Topics output from the parser object
    • \n
    \n\n
    Returns
    \n", "signature": "(\tcls,\tparser: debeir.core.parser.Parser,\toutput: Dict[Union[str, int], Dict]):", "funcdef": "def"}, {"fullname": "debeir.core.document", "modulename": "debeir.core.document", "kind": "module", "doc": "

    \n"}, {"fullname": "debeir.core.document.Document", "modulename": "debeir.core.document", "qualname": "Document", "kind": "class", "doc": "

    Generic Document class.\nUsed as an interface for interacting across multiple indexes with different mappings.

    \n"}, {"fullname": "debeir.core.document.Document.__init__", "modulename": "debeir.core.document", "qualname": "Document.__init__", "kind": "function", "doc": "

    \n", "signature": "(\tdoc_id: Union[int, float, str],\ttopic_num: Union[int, str, float] = None,\tfacets: Dict = None,\tscore: Union[float, int] = 0.0,\tscores: Dict[str, Union[float, int]] = <factory>)"}, {"fullname": "debeir.core.document.Document.from_results", "modulename": "debeir.core.document", "qualname": "Document.from_results", "kind": "function", "doc": "

    Produces a list of Document objects from raw results returned from the index

    \n\n

    In the format {topic_num: [Document, ..., Document]}

    \n", "signature": "(\tcls,\tresults,\t*args,\t**kwargs) -> Dict[Union[int, float], debeir.core.document.Document]:", "funcdef": "def"}, {"fullname": "debeir.core.document.Document.get_document_id", "modulename": "debeir.core.document", "qualname": "Document.get_document_id", "kind": "function", "doc": "
    Returns
    \n\n
    \n
    self.doc_id\n
    \n
    \n", "signature": "(self):", "funcdef": "def"}, {"fullname": "debeir.core.document.Document.flatten_facets", "modulename": "debeir.core.document", "qualname": "Document.flatten_facets", "kind": "function", "doc": "

    Flattens multi-level internal document facets into a single level\n e.g. Doc['Upper']['Lower'] -> Doc['Upper_Lower']

    \n\n
    Parameters
    \n\n
      \n
    • args:
    • \n
    • kwargs:
    • \n
    \n", "signature": "(self, *args, **kwargs):", "funcdef": "def"}, {"fullname": "debeir.core.document.Document.get_document_facet", "modulename": "debeir.core.document", "qualname": "Document.get_document_facet", "kind": "function", "doc": "

Retrieve a document facet.\nWorks for multidimensional or single keys

    \n\n
    Parameters
    \n\n
      \n
    • key: Facet to retrieve
    • \n
• sep: The separator for a multidimensional key
    • \n
    \n\n
    Returns
    \n\n
    \n
    Returns the document facet given the key (field)\n
    \n
    \n", "signature": "(self, key, sep='_'):", "funcdef": "def"}, {"fullname": "debeir.core.document.Document.set", "modulename": "debeir.core.document", "qualname": "Document.set", "kind": "function", "doc": "

    Set attributes of the object. Use keyword arguments to do so. Works as a builder class.\ndoc.set(doc_id=\"123\").set(facets={\"title\": \"my title\"})

    \n\n
    Parameters
    \n\n
      \n
    • doc_id:
    • \n
    • facets:
    • \n
    • score:
    • \n
    • facet:
    • \n
    • facet_value:
    • \n
    \n\n
    Returns
    \n\n
    \n
    Returns document object\n
    \n
    \n", "signature": "(\tself,\tdoc_id=None,\tfacets=None,\tscore=None,\tfacet=None,\tfacet_value=None) -> debeir.core.document.Document:", "funcdef": "def"}, {"fullname": "debeir.core.document.Document.to_trec_format", "modulename": "debeir.core.document", "qualname": "Document.to_trec_format", "kind": "function", "doc": "

    Returns TREC format for the document

    \n\n
    Returns
    \n\n
    \n
    A trec formatted string\n
    \n
    \n", "signature": "(self, rank, run_name) -> str:", "funcdef": "def"}, {"fullname": "debeir.core.document.Document.get_trec_format", "modulename": "debeir.core.document", "qualname": "Document.get_trec_format", "kind": "function", "doc": "

Get the TREC format of a list of ranked documents. This function is a generator.

    \n\n
    Parameters
    \n\n
      \n
    • ranked_list: A list of Document-type objects
    • \n
    • run_name: Run name to print in the TREC formatted string
    • \n
    • sort: Whether to sort the input list in descending order of score.
    • \n
    • sorting_func: Custom sorting function will be used if provided
    • \n
    \n", "signature": "(\tcls,\tranked_list: List[debeir.core.document.Document],\trun_name='NO_RUN_NAME',\tsort=True,\tsorting_func=None):", "funcdef": "def"}, {"fullname": "debeir.core.document.ElasticsearchDocument", "modulename": "debeir.core.document", "qualname": "ElasticsearchDocument", "kind": "class", "doc": "

    Generic Document class.\nUsed as an interface for interacting across multiple indexes with different mappings.

    \n", "bases": "Document"}, {"fullname": "debeir.core.document.ElasticsearchDocument.from_results", "modulename": "debeir.core.document", "qualname": "ElasticsearchDocument.from_results", "kind": "function", "doc": "

    Produces a list of Document objects from raw results returned from the index

    \n\n

    In the format {topic_num: [Document, ..., Document]}

    \n", "signature": "(\tcls,\tresults,\tquery_cls,\tignore_facets=True,\t*args,\t**kwargs) -> Dict[Union[int, float], debeir.core.document.Document]:", "funcdef": "def"}, {"fullname": "debeir.core.executor", "modulename": "debeir.core.executor", "kind": "module", "doc": "

    \n"}, {"fullname": "debeir.core.executor.GenericElasticsearchExecutor", "modulename": "debeir.core.executor", "qualname": "GenericElasticsearchExecutor", "kind": "class", "doc": "

    Generic Executor class for Elasticsearch

    \n", "bases": "debeir.engines.elasticsearch.executor.ElasticsearchExecutor"}, {"fullname": "debeir.core.executor.GenericElasticsearchExecutor.__init__", "modulename": "debeir.core.executor", "qualname": "GenericElasticsearchExecutor.__init__", "kind": "function", "doc": "

    \n", "signature": "(\ttopics: Dict[Union[str, int], Dict[str, str]],\tclient: elasticsearch.AsyncElasticsearch,\tindex_name: str,\toutput_file: str,\tquery: debeir.core.query.GenericElasticsearchQuery,\tencoder: Optional[debeir.rankers.transformer_sent_encoder.Encoder] = None,\tconfig=None,\t*args,\t**kwargs)"}, {"fullname": "debeir.core.executor.GenericElasticsearchExecutor.generate_query", "modulename": "debeir.core.executor", "qualname": "GenericElasticsearchExecutor.generate_query", "kind": "function", "doc": "

    Generates a standard BM25 query given the topic number

    \n\n
    Parameters
    \n\n
      \n
    • topic_num: Query topic number to generate
    • \n
    • best_fields: Whether to use a curated list of fields
    • \n
    • kwargs:
    • \n
    \n\n
    Returns
    \n", "signature": "(self, topic_num, best_fields=True, **kwargs):", "funcdef": "def"}, {"fullname": "debeir.core.executor.GenericElasticsearchExecutor.generate_embedding_query", "modulename": "debeir.core.executor", "qualname": "GenericElasticsearchExecutor.generate_embedding_query", "kind": "function", "doc": "

    Executes an NIR-style query with combined scoring.

    \n\n
    Parameters
    \n\n
      \n
    • topic_num:
    • \n
    • cosine_weights:
    • \n
    • query_weights:
    • \n
    • norm_weight:
    • \n
    • automatic_scores:
    • \n
    • kwargs:
    • \n
    \n\n
    Returns
    \n", "signature": "(\tself,\ttopic_num,\tcosine_weights=None,\tquery_weights=None,\tnorm_weight=2.15,\tautomatic_scores=None,\t**kwargs):", "funcdef": "def"}, {"fullname": "debeir.core.executor.GenericElasticsearchExecutor.execute_query", "modulename": "debeir.core.executor", "qualname": "GenericElasticsearchExecutor.execute_query", "kind": "function", "doc": "

    Execute a query given parameters

    \n\n
    Parameters
    \n\n
      \n
    • args:
    • \n
    • kwargs:
    • \n
    \n", "signature": "(\tself,\tquery=None,\treturn_size: int = None,\treturn_id_only: bool = None,\ttopic_num=None,\tablation=False,\tquery_type=None,\t**kwargs):", "funcdef": "async def"}, {"fullname": "debeir.core.executor.GenericElasticsearchExecutor.run_automatic_adjustment", "modulename": "debeir.core.executor", "qualname": "GenericElasticsearchExecutor.run_automatic_adjustment", "kind": "function", "doc": "

    Get the normalization constant to be used in NIR-style queries for all topics given an initial\nrun of BM25 results.

    \n", "signature": "(self, return_results=False):", "funcdef": "async def"}, {"fullname": "debeir.core.executor.GenericElasticsearchExecutor.build_from_config", "modulename": "debeir.core.executor", "qualname": "GenericElasticsearchExecutor.build_from_config", "kind": "function", "doc": "

Build a query executor engine from a config file.

    \n", "signature": "(\tcls,\ttopics: Dict,\tquery_obj: debeir.core.query.GenericElasticsearchQuery,\tclient,\tconfig: debeir.core.config.GenericConfig,\tnir_config: debeir.core.config.NIRConfig):", "funcdef": "def"}, {"fullname": "debeir.core.indexer", "modulename": "debeir.core.indexer", "kind": "module", "doc": "

    \n"}, {"fullname": "debeir.core.indexer.Indexer", "modulename": "debeir.core.indexer", "qualname": "Indexer", "kind": "class", "doc": "

    \n"}, {"fullname": "debeir.core.indexer.Indexer.__init__", "modulename": "debeir.core.indexer", "qualname": "Indexer.__init__", "kind": "function", "doc": "

    \n", "signature": "(client)"}, {"fullname": "debeir.core.indexer.Indexer.get_field", "modulename": "debeir.core.indexer", "qualname": "Indexer.get_field", "kind": "function", "doc": "

    \n", "signature": "(self, document, field):", "funcdef": "def"}, {"fullname": "debeir.core.indexer.SemanticElasticsearchIndexer", "modulename": "debeir.core.indexer", "qualname": "SemanticElasticsearchIndexer", "kind": "class", "doc": "

Create a NIR-style index with dense field representations using the provided sentence encoder.\nAssumes the documents have already been indexed.

    \n", "bases": "Indexer, threading.Thread"}, {"fullname": "debeir.core.indexer.SemanticElasticsearchIndexer.__init__", "modulename": "debeir.core.indexer", "qualname": "SemanticElasticsearchIndexer.__init__", "kind": "function", "doc": "

    This constructor should always be called with keyword arguments. Arguments are:

    \n\n

    group should be None; reserved for future extension when a ThreadGroup\nclass is implemented.

    \n\n

    target is the callable object to be invoked by the run()\nmethod. Defaults to None, meaning nothing is called.

    \n\n

    name is the thread name. By default, a unique name is constructed of\nthe form \"Thread-N\" where N is a small decimal number.

    \n\n

    args is the argument tuple for the target invocation. Defaults to ().

    \n\n

    kwargs is a dictionary of keyword arguments for the target\ninvocation. Defaults to {}.

    \n\n

    If a subclass overrides the constructor, it must make sure to invoke\nthe base class constructor (Thread.__init__()) before doing anything\nelse to the thread.

    \n", "signature": "(\tes_client: elasticsearch.Elasticsearch,\tencoder: debeir.rankers.transformer_sent_encoder.Encoder,\tindex: str,\tfields_to_encode: List[str],\tqueue: queue.Queue)"}, {"fullname": "debeir.core.indexer.SemanticElasticsearchIndexer.update_mappings", "modulename": "debeir.core.indexer", "qualname": "SemanticElasticsearchIndexer.update_mappings", "kind": "function", "doc": "

    \n", "signature": "(self, index, fields, client: elasticsearch.Elasticsearch):", "funcdef": "def"}, {"fullname": "debeir.core.indexer.SemanticElasticsearchIndexer.get_field", "modulename": "debeir.core.indexer", "qualname": "SemanticElasticsearchIndexer.get_field", "kind": "function", "doc": "

    \n", "signature": "(self, document, field):", "funcdef": "def"}, {"fullname": "debeir.core.indexer.SemanticElasticsearchIndexer.index_document", "modulename": "debeir.core.indexer", "qualname": "SemanticElasticsearchIndexer.index_document", "kind": "function", "doc": "

    \n", "signature": "(self, document):", "funcdef": "def"}, {"fullname": "debeir.core.indexer.SemanticElasticsearchIndexer.run", "modulename": "debeir.core.indexer", "qualname": "SemanticElasticsearchIndexer.run", "kind": "function", "doc": "

    Method representing the thread's activity.

    \n\n

    You may override this method in a subclass. The standard run() method\ninvokes the callable object passed to the object's constructor as the\ntarget argument, if any, with sequential and keyword arguments taken\nfrom the args and kwargs arguments, respectively.

    \n", "signature": "(self):", "funcdef": "def"}, {"fullname": "debeir.core.parser", "modulename": "debeir.core.parser", "kind": "module", "doc": "

    \n"}, {"fullname": "debeir.core.parser.Parser", "modulename": "debeir.core.parser", "qualname": "Parser", "kind": "class", "doc": "

    Parser interface

    \n"}, {"fullname": "debeir.core.parser.Parser.__init__", "modulename": "debeir.core.parser", "qualname": "Parser.__init__", "kind": "function", "doc": "

    \n", "signature": "(id_field: object, parse_fields: List[str])"}, {"fullname": "debeir.core.parser.Parser.normalize", "modulename": "debeir.core.parser", "qualname": "Parser.normalize", "kind": "function", "doc": "

    Flatten the dictionary, i.e. from Dict[int, Dict] -> Dict[str, str_or_int]

    \n\n
    Parameters
    \n\n
      \n
    • input_dict:
    • \n
    \n\n
    Returns
    \n", "signature": "(cls, input_dict) -> Dict:", "funcdef": "def"}, {"fullname": "debeir.core.parser.Parser.get_topics", "modulename": "debeir.core.parser", "qualname": "Parser.get_topics", "kind": "function", "doc": "

    Instance method for getting topics, forwards instance self parameters to the _get_topics class method.

    \n", "signature": "(self, path, *args, **kwargs):", "funcdef": "def"}, {"fullname": "debeir.core.parser.PickleParser", "modulename": "debeir.core.parser", "qualname": "PickleParser", "kind": "class", "doc": "

    Load topics from a pickle file

    \n", "bases": "Parser"}, {"fullname": "debeir.core.parser.PickleParser.__init__", "modulename": "debeir.core.parser", "qualname": "PickleParser.__init__", "kind": "function", "doc": "

    \n", "signature": "(id_field: object, parse_fields: List[str])"}, {"fullname": "debeir.core.parser.XMLParser", "modulename": "debeir.core.parser", "qualname": "XMLParser", "kind": "class", "doc": "

    Load topics from an XML file

    \n", "bases": "Parser"}, {"fullname": "debeir.core.parser.XMLParser.__init__", "modulename": "debeir.core.parser", "qualname": "XMLParser.__init__", "kind": "function", "doc": "

    \n", "signature": "(id_field: str, parse_fields: List[str], topic_field_name: str)"}, {"fullname": "debeir.core.parser.XMLParser.unwrap", "modulename": "debeir.core.parser", "qualname": "XMLParser.unwrap", "kind": "function", "doc": "

Converts a defaultdict to a dict, and unwraps a list of size 1 to just its element

    \n\n
    Parameters
    \n\n
      \n
    • doc_dict:
    • \n
    • key:
    • \n
    \n", "signature": "(cls, doc_dict, key):", "funcdef": "def"}, {"fullname": "debeir.core.parser.CSVParser", "modulename": "debeir.core.parser", "qualname": "CSVParser", "kind": "class", "doc": "

    Loads topics from a CSV file

    \n", "bases": "Parser"}, {"fullname": "debeir.core.parser.CSVParser.__init__", "modulename": "debeir.core.parser", "qualname": "CSVParser.__init__", "kind": "function", "doc": "

    \n", "signature": "(id_field=None, parse_fields=None)"}, {"fullname": "debeir.core.parser.TSVParser", "modulename": "debeir.core.parser", "qualname": "TSVParser", "kind": "class", "doc": "

    \n", "bases": "CSVParser"}, {"fullname": "debeir.core.parser.TSVParser.__init__", "modulename": "debeir.core.parser", "qualname": "TSVParser.__init__", "kind": "function", "doc": "

    \n", "signature": "(id_field: object, parse_fields: List[str])"}, {"fullname": "debeir.core.parser.JsonLinesParser", "modulename": "debeir.core.parser", "qualname": "JsonLinesParser", "kind": "class", "doc": "

Loads topics from a jsonl file,\none JSON object per line

    \n\n

Provide parse_fields, id_field and whether to ignore full matches on json keys.\nsecondary_id is appended to the primary id, as jsonlines are a flattened structure and may contain duplicate ids.

    \n", "bases": "Parser"}, {"fullname": "debeir.core.parser.JsonLinesParser.__init__", "modulename": "debeir.core.parser", "qualname": "JsonLinesParser.__init__", "kind": "function", "doc": "

    \n", "signature": "(\tid_field: str,\tparse_fields: List[str],\tignore_full_match: bool = True,\tsecondary_id: str = None)"}, {"fullname": "debeir.core.pipeline", "modulename": "debeir.core.pipeline", "kind": "module", "doc": "

    \n"}, {"fullname": "debeir.core.pipeline.Pipeline", "modulename": "debeir.core.pipeline", "qualname": "Pipeline", "kind": "class", "doc": "

    \n"}, {"fullname": "debeir.core.pipeline.Pipeline.__init__", "modulename": "debeir.core.pipeline", "qualname": "Pipeline.__init__", "kind": "function", "doc": "

    \n", "signature": "(\tengine: debeir.core.executor.GenericElasticsearchExecutor,\tengine_name: str,\tmetrics_config,\tengine_config,\tnir_config,\trun_config: debeir.core.config.Config,\tcallbacks=None)"}, {"fullname": "debeir.core.pipeline.Pipeline.disable", "modulename": "debeir.core.pipeline", "qualname": "Pipeline.disable", "kind": "function", "doc": "

    \n", "signature": "(self, parts: list):", "funcdef": "def"}, {"fullname": "debeir.core.pipeline.Pipeline.build_from_config", "modulename": "debeir.core.pipeline", "qualname": "Pipeline.build_from_config", "kind": "function", "doc": "

    \n", "signature": "(cls, nir_config_fp, engine, config_fp) -> debeir.core.pipeline.Pipeline:", "funcdef": "def"}, {"fullname": "debeir.core.pipeline.Pipeline.run_pipeline", "modulename": "debeir.core.pipeline", "qualname": "Pipeline.run_pipeline", "kind": "function", "doc": "

    \n", "signature": "(self, *args, **kwargs):", "funcdef": "async def"}, {"fullname": "debeir.core.pipeline.NIRPipeline", "modulename": "debeir.core.pipeline", "qualname": "NIRPipeline", "kind": "class", "doc": "

    \n", "bases": "Pipeline"}, {"fullname": "debeir.core.pipeline.NIRPipeline.__init__", "modulename": "debeir.core.pipeline", "qualname": "NIRPipeline.__init__", "kind": "function", "doc": "

    \n", "signature": "(*args, **kwargs)"}, {"fullname": "debeir.core.pipeline.NIRPipeline.prehook", "modulename": "debeir.core.pipeline", "qualname": "NIRPipeline.prehook", "kind": "function", "doc": "

    \n", "signature": "(self):", "funcdef": "async def"}, {"fullname": "debeir.core.pipeline.NIRPipeline.run_engine", "modulename": "debeir.core.pipeline", "qualname": "NIRPipeline.run_engine", "kind": "function", "doc": "

    \n", "signature": "(self, *args, **kwargs):", "funcdef": "async def"}, {"fullname": "debeir.core.pipeline.NIRPipeline.posthook", "modulename": "debeir.core.pipeline", "qualname": "NIRPipeline.posthook", "kind": "function", "doc": "

    \n", "signature": "(self, *args, **kwargs):", "funcdef": "async def"}, {"fullname": "debeir.core.pipeline.NIRPipeline.run_pipeline", "modulename": "debeir.core.pipeline", "qualname": "NIRPipeline.run_pipeline", "kind": "function", "doc": "

    \n", "signature": "(self, *args, return_results=False, **kwargs):", "funcdef": "async def"}, {"fullname": "debeir.core.pipeline.NIRPipeline.register_callback", "modulename": "debeir.core.pipeline", "qualname": "NIRPipeline.register_callback", "kind": "function", "doc": "

    \n", "signature": "(self, cb):", "funcdef": "def"}, {"fullname": "debeir.core.pipeline.BM25Pipeline", "modulename": "debeir.core.pipeline", "qualname": "BM25Pipeline", "kind": "class", "doc": "

    \n", "bases": "NIRPipeline"}, {"fullname": "debeir.core.pipeline.BM25Pipeline.run_pipeline", "modulename": "debeir.core.pipeline", "qualname": "BM25Pipeline.run_pipeline", "kind": "function", "doc": "

    \n", "signature": "(self, *args, return_results=False, **kwargs):", "funcdef": "async def"}, {"fullname": "debeir.core.query", "modulename": "debeir.core.query", "kind": "module", "doc": "

    \n"}, {"fullname": "debeir.core.query.Query", "modulename": "debeir.core.query", "qualname": "Query", "kind": "class", "doc": "

    A query interface class

    \n\n
    Parameters
    \n\n
      \n
    • topics: Topics that the query will be composed of
    • \n
    • config: Config object that contains the settings for querying
    • \n
    \n"}, {"fullname": "debeir.core.query.Query.__init__", "modulename": "debeir.core.query", "qualname": "Query.__init__", "kind": "function", "doc": "

    \n", "signature": "(\ttopics: Dict[int, Dict[str, str]],\tconfig: debeir.core.config.GenericConfig)"}, {"fullname": "debeir.core.query.GenericElasticsearchQuery", "modulename": "debeir.core.query", "qualname": "GenericElasticsearchQuery", "kind": "class", "doc": "

A generic elasticsearch query. Contains methods for NIR-style (embedding) queries and normal BM25 queries.\nRequires topics and configs to be included

    \n", "bases": "Query"}, {"fullname": "debeir.core.query.GenericElasticsearchQuery.__init__", "modulename": "debeir.core.query", "qualname": "GenericElasticsearchQuery.__init__", "kind": "function", "doc": "

    \n", "signature": "(\ttopics,\tconfig,\ttop_bm25_scores=None,\tmappings=None,\tid_mapping=None,\t*args,\t**kwargs)"}, {"fullname": "debeir.core.query.GenericElasticsearchQuery.generate_query", "modulename": "debeir.core.query", "qualname": "GenericElasticsearchQuery.generate_query", "kind": "function", "doc": "

Generates a simple BM25 query based on the query facets. Searches over all the document facets.

    \n\n
    Parameters
    \n\n
      \n
    • topic_num:
    • \n
    • args:
    • \n
    • kwargs:
    • \n
    \n\n
    Returns
    \n", "signature": "(self, topic_num, *args, **kwargs):", "funcdef": "def"}, {"fullname": "debeir.core.query.GenericElasticsearchQuery.set_bm25_scores", "modulename": "debeir.core.query", "qualname": "GenericElasticsearchQuery.set_bm25_scores", "kind": "function", "doc": "

    Sets BM25 scores that are used for NIR-style scoring. The top BM25 score for each topic is used\nfor log normalization.

    \n\n

    Score = log(bm25)/log(z) + embed_score

    \n\n
    Parameters
    \n\n
      \n
    • scores: Top BM25 Scores of the form {topic_num: top_bm25_score}
    • \n
    \n", "signature": "(self, scores: Dict[Union[str, int], Union[int, float]]):", "funcdef": "def"}, {"fullname": "debeir.core.query.GenericElasticsearchQuery.has_bm25_scores", "modulename": "debeir.core.query", "qualname": "GenericElasticsearchQuery.has_bm25_scores", "kind": "function", "doc": "

    Checks if BM25 scores have been set

    \n\n
    Returns
    \n", "signature": "(self):", "funcdef": "def"}, {"fullname": "debeir.core.query.GenericElasticsearchQuery.generate_query_embedding", "modulename": "debeir.core.query", "qualname": "GenericElasticsearchQuery.generate_query_embedding", "kind": "function", "doc": "

    Generates an embedding script score query for Elasticsearch as part of the NIR scoring function.

    \n\n
    Parameters
    \n\n
      \n
    • topic_num: The topic number to search for
    • \n
    • encoder: The encoder that will be used for encoding the topics
    • \n
    • norm_weight: The BM25 log normalization constant
    • \n
    • ablations: Whether to execute ablation style queries (i.e. one query facet\nor one document facet at a time)
    • \n
    • cosine_ceiling: Cosine ceiling used for automatic z-log normalization parameter calculation
    • \n
    • args:
    • \n
    • kwargs: Pass disable_cache to disable encoder caching
    • \n
    \n\n
    Returns
    \n\n
    \n
    An elasticsearch script_score query\n
    \n
    \n", "signature": "(self, *args, **kwargs):", "funcdef": "def"}, {"fullname": "debeir.core.query.GenericElasticsearchQuery.get_id_mapping", "modulename": "debeir.core.query", "qualname": "GenericElasticsearchQuery.get_id_mapping", "kind": "function", "doc": "

    Get the document ID

    \n\n
    Parameters
    \n\n
      \n
    • hit: The raw document result
    • \n
    \n\n
    Returns
    \n\n
    \n
    The document's ID\n
    \n
    \n", "signature": "(cls, hit):", "funcdef": "def"}, {"fullname": "debeir.core.results", "modulename": "debeir.core.results", "kind": "module", "doc": "

    \n"}, {"fullname": "debeir.core.results.Results", "modulename": "debeir.core.results", "qualname": "Results", "kind": "class", "doc": "

    \n"}, {"fullname": "debeir.core.results.Results.__init__", "modulename": "debeir.core.results", "qualname": "Results.__init__", "kind": "function", "doc": "

    \n", "signature": "(results: List, query_cls, engine_name)"}, {"fullname": "debeir.core.results.Results.get_topic_ids", "modulename": "debeir.core.results", "qualname": "Results.get_topic_ids", "kind": "function", "doc": "

    \n", "signature": "(self):", "funcdef": "def"}, {"fullname": "debeir.datasets", "modulename": "debeir.datasets", "kind": "module", "doc": "

Contains datasets implemented from debeir.core interfaces

\n\n
  \n
1. Parser (For reading data from files into a Dict object)
\n
2. Query object (Generating queries)\n
    \n
  • These query objects can be very lightweight containing only the mappings of the index.
  • \n
\n
    \n"}, {"fullname": "debeir.datasets.bioreddit", "modulename": "debeir.datasets.bioreddit", "kind": "module", "doc": "

    \n"}, {"fullname": "debeir.datasets.bioreddit.BioRedditSubmissionParser", "modulename": "debeir.datasets.bioreddit", "qualname": "BioRedditSubmissionParser", "kind": "class", "doc": "

    Parser for the BioReddit Submission Dataset

    \n", "bases": "debeir.core.parser.CSVParser"}, {"fullname": "debeir.datasets.bioreddit.BioRedditSubmissionParser.get_topics", "modulename": "debeir.datasets.bioreddit", "qualname": "BioRedditSubmissionParser.get_topics", "kind": "function", "doc": "

    Instance method for getting topics, forwards instance self parameters to the _get_topics class method.

    \n", "signature": "(cls, csvfile) -> Dict[int, Dict[str, str]]:", "funcdef": "def"}, {"fullname": "debeir.datasets.bioreddit.BioRedditCommentParser", "modulename": "debeir.datasets.bioreddit", "qualname": "BioRedditCommentParser", "kind": "class", "doc": "

    Parser for the BioReddit Comment Dataset

    \n", "bases": "debeir.core.parser.CSVParser"}, {"fullname": "debeir.datasets.bioreddit.BioRedditCommentParser.get_topics", "modulename": "debeir.datasets.bioreddit", "qualname": "BioRedditCommentParser.get_topics", "kind": "function", "doc": "

    Instance method for getting topics, forwards instance self parameters to the _get_topics class method.

    \n", "signature": "(cls, csvfile) -> Dict[str, Dict[str, str]]:", "funcdef": "def"}, {"fullname": "debeir.datasets.bioreddit.BioRedditElasticsearchQuery", "modulename": "debeir.datasets.bioreddit", "qualname": "BioRedditElasticsearchQuery", "kind": "class", "doc": "

    Elasticsearch Query object for the BioReddit

    \n", "bases": "debeir.core.query.GenericElasticsearchQuery"}, {"fullname": "debeir.datasets.bioreddit.BioRedditElasticsearchQuery.__init__", "modulename": "debeir.datasets.bioreddit", "qualname": "BioRedditElasticsearchQuery.__init__", "kind": "function", "doc": "

    \n", "signature": "(topics, config, *args, **kwargs)"}, {"fullname": "debeir.datasets.clinical_trials", "modulename": "debeir.datasets.clinical_trials", "kind": "module", "doc": "

    \n"}, {"fullname": "debeir.datasets.clinical_trials.TrialsQueryConfig", "modulename": "debeir.datasets.clinical_trials", "qualname": "TrialsQueryConfig", "kind": "class", "doc": "

    \n", "bases": "debeir.core.config.GenericConfig"}, {"fullname": "debeir.datasets.clinical_trials.TrialsQueryConfig.__init__", "modulename": "debeir.datasets.clinical_trials", "qualname": "TrialsQueryConfig.__init__", "kind": "function", "doc": "

    \n", "signature": "(\tquery_type: str,\tindex: str = None,\tencoder_normalize: bool = True,\tablations: bool = False,\tnorm_weight: float = None,\tautomatic: bool = None,\tencoder: object = None,\tencoder_fp: str = None,\tquery_weights: List[float] = None,\tcosine_weights: List[float] = None,\tevaluate: bool = False,\tqrels: str = None,\tconfig_fn: str = None,\tquery_fn: str = None,\tparser_fn: str = None,\texecutor_fn: str = None,\tcosine_ceiling: float = None,\ttopics_path: str = None,\treturn_id_only: bool = False,\toverwrite_output_if_exists: bool = False,\toutput_file: str = None,\trun_name: str = None,\tquery_field_usage: str = None,\tembed_field_usage: str = None,\tfields: List[str] = None)"}, {"fullname": "debeir.datasets.clinical_trials.TrialsQueryConfig.validate", "modulename": "debeir.datasets.clinical_trials", "qualname": "TrialsQueryConfig.validate", "kind": "function", "doc": "

    Checks if query type is included, and checks if an encoder is included for embedding queries

    \n", "signature": "(self):", "funcdef": "def"}, {"fullname": "debeir.datasets.clinical_trials.TrialsQueryConfig.from_toml", "modulename": "debeir.datasets.clinical_trials", "qualname": "TrialsQueryConfig.from_toml", "kind": "function", "doc": "

    Instantiates a Config object from a toml file

    \n\n
    Parameters
    \n\n
      \n
    • fp: File path of the Config TOML file
    • \n
    • field_class: Class of the Config object to be instantiated
    • \n
    • args: Arguments to be passed to Config
    • \n
    • kwargs: Keyword arguments to be passed
    • \n
    \n\n
    Returns
    \n\n
    \n
An instantiated and validated Config object.\n
    \n
    \n", "signature": "(cls, fp: str, *args, **kwargs) -> debeir.core.config.GenericConfig:", "funcdef": "def"}, {"fullname": "debeir.datasets.clinical_trials.TrialsQueryConfig.from_dict", "modulename": "debeir.datasets.clinical_trials", "qualname": "TrialsQueryConfig.from_dict", "kind": "function", "doc": "

    Instantiates a Config object from a dictionary

    \n\n
    Parameters
    \n\n
      \n
    • data_class:
    • \n
    • kwargs:
    • \n
    \n\n
    Returns
    \n", "signature": "(cls, **kwargs) -> debeir.core.config.GenericConfig:", "funcdef": "def"}, {"fullname": "debeir.datasets.clinical_trials.TrialsElasticsearchQuery", "modulename": "debeir.datasets.clinical_trials", "qualname": "TrialsElasticsearchQuery", "kind": "class", "doc": "

    Elasticsearch Query object for the Clinical Trials Index

    \n", "bases": "debeir.core.query.GenericElasticsearchQuery"}, {"fullname": "debeir.datasets.clinical_trials.TrialsElasticsearchQuery.__init__", "modulename": "debeir.datasets.clinical_trials", "qualname": "TrialsElasticsearchQuery.__init__", "kind": "function", "doc": "

    \n", "signature": "(topics, query_type, config=None, *args, **kwargs)"}, {"fullname": "debeir.datasets.clinical_trials.TrialsElasticsearchQuery.generate_query", "modulename": "debeir.datasets.clinical_trials", "qualname": "TrialsElasticsearchQuery.generate_query", "kind": "function", "doc": "

    Generates a query for the clinical trials index

    \n\n
    Parameters
    \n\n
      \n
    • topic_num: Topic number to search
    • \n
    • query_field_usage: Which document facets to search over
    • \n
    • kwargs:
    • \n
    \n\n
    Returns
    \n\n
    \n
    A basic elasticsearch query for clinical trials\n
    \n
    \n", "signature": "(self, *args, **kwargs):", "funcdef": "def"}, {"fullname": "debeir.datasets.clinical_trials.TrialsElasticsearchQuery.generate_query_ablation", "modulename": "debeir.datasets.clinical_trials", "qualname": "TrialsElasticsearchQuery.generate_query_ablation", "kind": "function", "doc": "

    Only search one document facet at a time

    \n\n
    Parameters
    \n\n
      \n
    • topic_num:
    • \n
    • kwargs:
    • \n
    \n\n
    Returns
    \n", "signature": "(self, topic_num, **kwargs):", "funcdef": "def"}, {"fullname": "debeir.datasets.clinical_trials.TrialsElasticsearchQuery.generate_query_embedding", "modulename": "debeir.datasets.clinical_trials", "qualname": "TrialsElasticsearchQuery.generate_query_embedding", "kind": "function", "doc": "

    Computes the NIR score for a given topic

    \n\n

    Score = log(BM25)/log(norm_weight) + embedding_score

    \n\n
    Parameters
    \n\n
      \n
    • topic_num:
    • \n
    • encoder:
    • \n
    • query_field_usage:
    • \n
    • embed_field_usage:
    • \n
    • cosine_weights:
    • \n
    • query_weight:
    • \n
    • norm_weight:
    • \n
    • ablations:
    • \n
    • automatic_scores:
    • \n
    • kwargs:
    • \n
    \n\n
    Returns
    \n", "signature": "(self, *args, **kwargs):", "funcdef": "def"}, {"fullname": "debeir.datasets.clinical_trials.TrialsElasticsearchQuery.get_query_type", "modulename": "debeir.datasets.clinical_trials", "qualname": "TrialsElasticsearchQuery.get_query_type", "kind": "function", "doc": "

    \n", "signature": "(self, *args, **kwargs):", "funcdef": "def"}, {"fullname": "debeir.datasets.clinical_trials.TrialsElasticsearchQuery.get_id_mapping", "modulename": "debeir.datasets.clinical_trials", "qualname": "TrialsElasticsearchQuery.get_id_mapping", "kind": "function", "doc": "

    Get the document ID

    \n\n
    Parameters
    \n\n
      \n
    • hit: The raw document result
    • \n
    \n\n
    Returns
    \n\n
    \n
    The document's ID\n
    \n
    \n", "signature": "(self, hit):", "funcdef": "def"}, {"fullname": "debeir.datasets.clinical_trials.ClinicalTrialsElasticsearchExecutor", "modulename": "debeir.datasets.clinical_trials", "qualname": "ClinicalTrialsElasticsearchExecutor", "kind": "class", "doc": "

    Executes queries given a query object.

    \n", "bases": "debeir.core.executor.GenericElasticsearchExecutor"}, {"fullname": "debeir.datasets.clinical_trials.ClinicalTrialsElasticsearchExecutor.__init__", "modulename": "debeir.datasets.clinical_trials", "qualname": "ClinicalTrialsElasticsearchExecutor.__init__", "kind": "function", "doc": "

    \n", "signature": "(\ttopics: Dict[Union[str, int], Dict[str, str]],\tclient: elasticsearch.AsyncElasticsearch,\tindex_name: str,\toutput_file: str,\tquery: debeir.datasets.clinical_trials.TrialsElasticsearchQuery,\tencoder: Optional[debeir.rankers.transformer_sent_encoder.Encoder] = None,\tconfig=None,\t*args,\t**kwargs)"}, {"fullname": "debeir.datasets.clinical_trials.ClinicalTrialParser", "modulename": "debeir.datasets.clinical_trials", "qualname": "ClinicalTrialParser", "kind": "class", "doc": "

    Parser for Clinical Trials topics

    \n", "bases": "debeir.core.parser.Parser"}, {"fullname": "debeir.datasets.clinical_trials.ClinicalTrialParser.get_topics", "modulename": "debeir.datasets.clinical_trials", "qualname": "ClinicalTrialParser.get_topics", "kind": "function", "doc": "

    Instance method for getting topics, forwards instance self parameters to the _get_topics class method.

    \n", "signature": "(cls, csvfile) -> Dict[int, Dict[str, str]]:", "funcdef": "def"}, {"fullname": "debeir.datasets.factory", "modulename": "debeir.datasets.factory", "kind": "module", "doc": "

    \n"}, {"fullname": "debeir.datasets.factory.get_index_name", "modulename": "debeir.datasets.factory", "qualname": "get_index_name", "kind": "function", "doc": "

Get the index name from the config file without parsing it as TOML

    \n\n
    Parameters
    \n\n
      \n
    • config_fp:
    • \n
    \n\n
    Returns
    \n", "signature": "(config_fp):", "funcdef": "def"}, {"fullname": "debeir.datasets.factory.factory_fn", "modulename": "debeir.datasets.factory", "qualname": "factory_fn", "kind": "function", "doc": "

    Factory method for creating the parsed topics, config object, query object and query executor object

    \n\n
    Parameters
    \n\n
      \n
    • config_fp: Config file path
    • \n
    • index: Index to search
    • \n
    \n\n
    Returns
    \n\n
    \n
    Query, Config, Parser, Executor, Evaluator\n
    \n
    \n", "signature": "(\tconfig_fp,\tindex=None) -> (<class 'debeir.core.query.Query'>, <class 'debeir.core.config.GenericConfig'>, <class 'debeir.core.parser.Parser'>, <class 'debeir.core.executor.GenericElasticsearchExecutor'>, <class 'debeir.evaluation.evaluator.Evaluator'>):", "funcdef": "def"}, {"fullname": "debeir.datasets.factory.config_factory", "modulename": "debeir.datasets.factory", "qualname": "config_factory", "kind": "function", "doc": "

    Factory method for creating configs

    \n\n
    Parameters
    \n\n
      \n
    • path: Config path
    • \n
    • config_cls: Config class to instantiate
    • \n
    • args_dict: Arguments to consider
    • \n
    \n\n
    Returns
    \n\n
    \n
    A config object\n
    \n
    \n", "signature": "(\tpath: Union[str, pathlib.Path] = None,\tconfig_cls: Type[debeir.core.config.Config] = None,\targs_dict: Dict = None):", "funcdef": "def"}, {"fullname": "debeir.datasets.factory.get_nir_config", "modulename": "debeir.datasets.factory", "qualname": "get_nir_config", "kind": "function", "doc": "

    \n", "signature": "(nir_config, *args, ignore_errors=False, **kwargs):", "funcdef": "def"}, {"fullname": "debeir.datasets.factory.apply_nir_config", "modulename": "debeir.datasets.factory", "qualname": "apply_nir_config", "kind": "function", "doc": "

Decorator that applies the NIR config settings to the current function.\nReplaces arguments and keyword arguments with those found in the config

    \n\n
    Parameters
    \n\n
      \n
    • func:
    • \n
    \n\n
    Returns
    \n", "signature": "(func):", "funcdef": "def"}, {"fullname": "debeir.datasets.marco", "modulename": "debeir.datasets.marco", "kind": "module", "doc": "

    \n"}, {"fullname": "debeir.datasets.marco.MarcoElasticsearchExecutor", "modulename": "debeir.datasets.marco", "qualname": "MarcoElasticsearchExecutor", "kind": "class", "doc": "

    Generic Executor class for Elasticsearch

    \n", "bases": "debeir.core.executor.GenericElasticsearchExecutor"}, {"fullname": "debeir.datasets.marco.MarcoElasticsearchExecutor.__init__", "modulename": "debeir.datasets.marco", "qualname": "MarcoElasticsearchExecutor.__init__", "kind": "function", "doc": "

    \n", "signature": "(\ttopics: Dict[Union[str, int], Dict[str, str]],\tclient: elasticsearch.AsyncElasticsearch,\tindex_name: str,\toutput_file: str,\tquery: debeir.core.query.GenericElasticsearchQuery,\tencoder: Optional[debeir.rankers.transformer_sent_encoder.Encoder] = None,\tconfig=None,\t*args,\t**kwargs)"}, {"fullname": "debeir.datasets.marco.MarcoElasticsearchExecutor.generate_query", "modulename": "debeir.datasets.marco", "qualname": "MarcoElasticsearchExecutor.generate_query", "kind": "function", "doc": "

    Generates a standard BM25 query given the topic number

    \n\n
    Parameters
    \n\n
      \n
    • topic_num: Query topic number to generate
    • \n
    • best_fields: Whether to use a curated list of fields
    • \n
    • kwargs:
    • \n
    \n\n
    Returns
    \n", "signature": "(self, topic_num, best_fields=True, **kwargs):", "funcdef": "def"}, {"fullname": "debeir.datasets.marco.MarcoElasticsearchExecutor.generate_embedding_query", "modulename": "debeir.datasets.marco", "qualname": "MarcoElasticsearchExecutor.generate_embedding_query", "kind": "function", "doc": "

    Executes an NIR-style query with combined scoring.

    \n\n
    Parameters
    \n\n
      \n
    • topic_num:
    • \n
    • cosine_weights:
    • \n
    • query_weights:
    • \n
    • norm_weight:
    • \n
    • automatic_scores:
    • \n
    • kwargs:
    • \n
    \n\n
    Returns
    \n", "signature": "(\tself,\ttopic_num,\tcosine_weights=None,\tquery_weights=None,\tnorm_weight=2.15,\tautomatic_scores=None,\t**kwargs):", "funcdef": "def"}, {"fullname": "debeir.datasets.marco.MarcoElasticsearchExecutor.execute_query", "modulename": "debeir.datasets.marco", "qualname": "MarcoElasticsearchExecutor.execute_query", "kind": "function", "doc": "

    Execute a query given parameters

    \n\n
    Parameters
    \n\n
      \n
    • args:
    • \n
    • kwargs:
    • \n
    \n", "signature": "(\tself,\tquery=None,\ttopic_num=None,\tablation=False,\tquery_type='query',\t**kwargs):", "funcdef": "async def"}, {"fullname": "debeir.datasets.marco.MarcoQueryConfig", "modulename": "debeir.datasets.marco", "qualname": "MarcoQueryConfig", "kind": "class", "doc": "

    \n", "bases": "debeir.core.config.GenericConfig"}, {"fullname": "debeir.datasets.marco.MarcoQueryConfig.__init__", "modulename": "debeir.datasets.marco", "qualname": "MarcoQueryConfig.__init__", "kind": "function", "doc": "

    \n", "signature": "(\tquery_type: str,\tindex: str = None,\tencoder_normalize: bool = True,\tablations: bool = False,\tnorm_weight: float = None,\tautomatic: bool = None,\tencoder: object = None,\tencoder_fp: str = None,\tquery_weights: List[float] = None,\tcosine_weights: List[float] = None,\tevaluate: bool = False,\tqrels: str = None,\tconfig_fn: str = None,\tquery_fn: str = None,\tparser_fn: str = None,\texecutor_fn: str = None,\tcosine_ceiling: float = None,\ttopics_path: str = None,\treturn_id_only: bool = False,\toverwrite_output_if_exists: bool = False,\toutput_file: str = None,\trun_name: str = None)"}, {"fullname": "debeir.datasets.marco.MarcoQueryConfig.validate", "modulename": "debeir.datasets.marco", "qualname": "MarcoQueryConfig.validate", "kind": "function", "doc": "

Validates that the config is correct.\nMust be implemented by inheriting classes.

    \n", "signature": "(self):", "funcdef": "def"}, {"fullname": "debeir.datasets.marco.MarcoQueryConfig.from_toml", "modulename": "debeir.datasets.marco", "qualname": "MarcoQueryConfig.from_toml", "kind": "function", "doc": "

Instantiates a Config object from a TOML file

    \n\n
    Parameters
    \n\n
      \n
    • fp: File path of the Config TOML file
    • \n
    • field_class: Class of the Config object to be instantiated
    • \n
    • args: Arguments to be passed to Config
    • \n
    • kwargs: Keyword arguments to be passed
    • \n
    \n\n
    Returns
    \n\n
    \n
An instantiated and validated Config object.\n
    \n
    \n", "signature": "(cls, fp: str, *args, **kwargs) -> debeir.datasets.marco.MarcoQueryConfig:", "funcdef": "def"}, {"fullname": "debeir.datasets.marco.MarcoQueryConfig.from_dict", "modulename": "debeir.datasets.marco", "qualname": "MarcoQueryConfig.from_dict", "kind": "function", "doc": "

    Instantiates a Config object from a dictionary

    \n\n
    Parameters
    \n\n
      \n
    • data_class:
    • \n
    • kwargs:
    • \n
    \n\n
    Returns
    \n", "signature": "(cls, **kwargs) -> debeir.datasets.marco.MarcoQueryConfig:", "funcdef": "def"}, {"fullname": "debeir.datasets.trec_clinical_trials", "modulename": "debeir.datasets.trec_clinical_trials", "kind": "module", "doc": "

    \n"}, {"fullname": "debeir.datasets.trec_clinical_trials.TREClinicalTrialDocumentParser", "modulename": "debeir.datasets.trec_clinical_trials", "qualname": "TREClinicalTrialDocumentParser", "kind": "class", "doc": "

    Parser for Clinical Trials topics

    \n", "bases": "debeir.core.parser.XMLParser"}, {"fullname": "debeir.datasets.trec_clinical_trials.TREClinicalTrialDocumentParser.extract", "modulename": "debeir.datasets.trec_clinical_trials", "qualname": "TREClinicalTrialDocumentParser.extract", "kind": "function", "doc": "

    \n", "signature": "(cls, path) -> Dict:", "funcdef": "def"}, {"fullname": "debeir.datasets.trec_clinical_trials.TrecClincialElasticsearchQuery", "modulename": "debeir.datasets.trec_clinical_trials", "qualname": "TrecClincialElasticsearchQuery", "kind": "class", "doc": "

A generic Elasticsearch query. Contains methods for NIR-style (embedding) queries and normal BM25 queries.\nRequires topics and configs to be supplied.

    \n", "bases": "debeir.core.query.GenericElasticsearchQuery"}, {"fullname": "debeir.datasets.trec_clinical_trials.TrecClincialElasticsearchQuery.__init__", "modulename": "debeir.datasets.trec_clinical_trials", "qualname": "TrecClincialElasticsearchQuery.__init__", "kind": "function", "doc": "

    \n", "signature": "(topics, config, *args, **kwargs)"}, {"fullname": "debeir.datasets.trec_covid", "modulename": "debeir.datasets.trec_covid", "kind": "module", "doc": "

    \n"}, {"fullname": "debeir.datasets.trec_covid.TrecCovidParser", "modulename": "debeir.datasets.trec_covid", "qualname": "TrecCovidParser", "kind": "class", "doc": "

    Load topics from an XML file

    \n", "bases": "debeir.core.parser.XMLParser"}, {"fullname": "debeir.datasets.trec_covid.TrecCovidParser.get_topics", "modulename": "debeir.datasets.trec_covid", "qualname": "TrecCovidParser.get_topics", "kind": "function", "doc": "

Instance method for getting topics; forwards instance parameters to the _get_topics class method.

    \n", "signature": "(cls, xmlfile) -> Dict[int, Dict[str, str]]:", "funcdef": "def"}, {"fullname": "debeir.datasets.trec_covid.TrecElasticsearchQuery", "modulename": "debeir.datasets.trec_covid", "qualname": "TrecElasticsearchQuery", "kind": "class", "doc": "

A generic Elasticsearch query. Contains methods for NIR-style (embedding) queries and normal BM25 queries.\nRequires topics and configs to be supplied.

    \n", "bases": "debeir.core.query.GenericElasticsearchQuery"}, {"fullname": "debeir.datasets.trec_covid.TrecElasticsearchQuery.__init__", "modulename": "debeir.datasets.trec_covid", "qualname": "TrecElasticsearchQuery.__init__", "kind": "function", "doc": "

    \n", "signature": "(topics, config, *args, **kwargs)"}, {"fullname": "debeir.datasets.types", "modulename": "debeir.datasets.types", "kind": "module", "doc": "

    \n"}, {"fullname": "debeir.datasets.types.InputExample", "modulename": "debeir.datasets.types", "qualname": "InputExample", "kind": "class", "doc": "

Copied from the Sentence Transformers library.\nStructure for one input example with texts, the label and a unique id

    \n"}, {"fullname": "debeir.datasets.types.InputExample.__init__", "modulename": "debeir.datasets.types", "qualname": "InputExample.__init__", "kind": "function", "doc": "

    Creates one InputExample with the given texts, guid and label

    \n\n

:param guid: id for the example\n:param texts: the texts for the example. Note, str.strip() is called on the texts\n:param label: the label for the example
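    Example — a minimal sketch constructing one (illustrative) query-document training pair with a binary label:

```python
from debeir.datasets.types import InputExample

example = InputExample(
    guid="topic1-doc42",  # hypothetical id
    texts=["what is diabetes?", "Diabetes is a metabolic disease ..."],
    label=1,
)
```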

    \n", "signature": "(\tguid: str = '',\ttexts: List[str] = None,\tlabel: Union[int, float] = 0)"}, {"fullname": "debeir.datasets.types.InputExample.get_label", "modulename": "debeir.datasets.types", "qualname": "InputExample.get_label", "kind": "function", "doc": "

    \n", "signature": "(self):", "funcdef": "def"}, {"fullname": "debeir.datasets.types.InputExample.to_dict", "modulename": "debeir.datasets.types", "qualname": "InputExample.to_dict", "kind": "function", "doc": "

    \n", "signature": "(cls, data: List[debeir.datasets.types.InputExample]):", "funcdef": "def"}, {"fullname": "debeir.datasets.types.InputExample.from_parser_output", "modulename": "debeir.datasets.types", "qualname": "InputExample.from_parser_output", "kind": "function", "doc": "

    \n", "signature": "(cls, data):", "funcdef": "def"}, {"fullname": "debeir.datasets.types.RelevanceExample", "modulename": "debeir.datasets.types", "qualname": "RelevanceExample", "kind": "class", "doc": "

Converts relevance labels to the 0 - 1 range

    \n", "bases": "InputExample"}, {"fullname": "debeir.datasets.types.RelevanceExample.__init__", "modulename": "debeir.datasets.types", "qualname": "RelevanceExample.__init__", "kind": "function", "doc": "

    Creates one InputExample with the given texts, guid and label

    \n\n

:param guid: id for the example\n:param texts: the texts for the example. Note, str.strip() is called on the texts\n:param label: the label for the example

    \n", "signature": "(max_score=2, *args, **kwargs)"}, {"fullname": "debeir.datasets.types.RelevanceExample.get_label", "modulename": "debeir.datasets.types", "qualname": "RelevanceExample.get_label", "kind": "function", "doc": "

    \n", "signature": "(self):", "funcdef": "def"}, {"fullname": "debeir.datasets.types.RelevanceExample.relevance", "modulename": "debeir.datasets.types", "qualname": "RelevanceExample.relevance", "kind": "function", "doc": "
    Returns
    \n\n
    \n
Returns a normalised relevance score between 0 and 1\n
    \n
    \n", "signature": "(self):", "funcdef": "def"}, {"fullname": "debeir.datasets.types.DatasetTypes", "modulename": "debeir.datasets.types", "qualname": "DatasetTypes", "kind": "class", "doc": "

A collection of common dataset types that are usable in the library.

    \n", "bases": "enum.Enum"}, {"fullname": "debeir.datasets.utils", "modulename": "debeir.datasets.utils", "kind": "module", "doc": "

    \n"}, {"fullname": "debeir.datasets.utils.CrossValidatorDataset", "modulename": "debeir.datasets.utils", "qualname": "CrossValidatorDataset", "kind": "class", "doc": "

    Cross Validator Dataset

    \n"}, {"fullname": "debeir.datasets.utils.CrossValidatorDataset.__init__", "modulename": "debeir.datasets.utils", "qualname": "CrossValidatorDataset.__init__", "kind": "function", "doc": "

    \n", "signature": "(dataset, cross_validator, n_folds, x_attr='text', y_attr='label')"}, {"fullname": "debeir.datasets.utils.CrossValidatorDataset.prepare_cross_validator", "modulename": "debeir.datasets.utils", "qualname": "CrossValidatorDataset.prepare_cross_validator", "kind": "function", "doc": "

    Prepare the cross validator dataset object that will internally produce the folds.

    \n\n
    Parameters
    \n\n
      \n
• data: Dataset to be used. Should be a list of dicts, a list of [x, y] pairs, or a Dataset object from the datasets library
    • \n
    • evaluator: Evaluator to use for checking results
    • \n
    • n_splits: Number of cross validation splits, k-fold (stratified)
    • \n
    • seed: Seed to use (default 42)
    • \n
    • y_attr: Label, or idx of the y label
    • \n
    • x_attr: Label or idx of the x label (not directly used)
    • \n
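    Example — a minimal sketch with toy two-class data; the qrels path and metric name are illustrative:

```python
from debeir.datasets.utils import CrossValidatorDataset
from debeir.evaluation.evaluator import Evaluator

data = [{"text": "doc one", "label": 0}, {"text": "doc two", "label": 1}]
evaluator = Evaluator(qrels="qrels.txt", metrics=["NDCG@10"])

cv_dataset = CrossValidatorDataset.prepare_cross_validator(
    data, evaluator=evaluator, n_splits=2,
    x_attr="text", y_attr="label", seed=42,
)
fold = cv_dataset.get_fold(0)  # DatasetDict{'train': ..., 'val': ...}
```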
    \n", "signature": "(\tcls,\tdata,\tevaluator: debeir.evaluation.evaluator.Evaluator,\tn_splits: int,\tx_attr,\ty_attr,\tseed=42) -> debeir.datasets.utils.CrossValidatorDataset:", "funcdef": "def"}, {"fullname": "debeir.datasets.utils.CrossValidatorDataset.get_fold", "modulename": "debeir.datasets.utils", "qualname": "CrossValidatorDataset.get_fold", "kind": "function", "doc": "

Gets the fold and returns a datasets.DatasetDict object of the form\nDatasetDict{'train': ..., 'val': ...}

    \n\n
    Parameters
    \n\n
      \n
    • idx:
    • \n
    \n", "signature": "(self, idx) -> datasets.dataset_dict.DatasetDict:", "funcdef": "def"}, {"fullname": "debeir.engines", "modulename": "debeir.engines", "kind": "module", "doc": "

    WIP

    \n\n

    Implemented Search Engines to run queries against.

    \n"}, {"fullname": "debeir.engines.client", "modulename": "debeir.engines.client", "kind": "module", "doc": "

    \n"}, {"fullname": "debeir.engines.client.Client", "modulename": "debeir.engines.client", "qualname": "Client", "kind": "class", "doc": "

Overarching client interface object that contains references to different clients for search.\nAllows sharing between function calls

    \n"}, {"fullname": "debeir.engines.client.Client.__init__", "modulename": "debeir.engines.client", "qualname": "Client.__init__", "kind": "function", "doc": "

    \n", "signature": "(\tes_client: elasticsearch.AsyncElasticsearch = None,\tsolr_client: object = None,\tgeneric_client: object = None)"}, {"fullname": "debeir.engines.client.Client.build_from_config", "modulename": "debeir.engines.client", "qualname": "Client.build_from_config", "kind": "function", "doc": "

    Build client from engine config

    \n\n
    Parameters
    \n\n
      \n
    • engine_type:
    • \n
    • engine_config:
    • \n
    \n\n
    Returns
    \n", "signature": "(cls, engine_type, engine_config) -> debeir.engines.client.Client:", "funcdef": "def"}, {"fullname": "debeir.engines.client.Client.get_client", "modulename": "debeir.engines.client", "qualname": "Client.get_client", "kind": "function", "doc": "

    \n", "signature": "(self, engine):", "funcdef": "def"}, {"fullname": "debeir.engines.client.Client.close", "modulename": "debeir.engines.client", "qualname": "Client.close", "kind": "function", "doc": "

    Generically close all contained client objects

    \n", "signature": "(self):", "funcdef": "async def"}, {"fullname": "debeir.engines.dummyindex", "modulename": "debeir.engines.dummyindex", "kind": "module", "doc": "

    \n"}, {"fullname": "debeir.engines.dummyindex.index", "modulename": "debeir.engines.dummyindex.index", "kind": "module", "doc": "

    \n"}, {"fullname": "debeir.engines.dummyindex.index.DummyIndex", "modulename": "debeir.engines.dummyindex.index", "qualname": "DummyIndex", "kind": "class", "doc": "

    \n"}, {"fullname": "debeir.engines.dummyindex.index.DummyIndex.__init__", "modulename": "debeir.engines.dummyindex.index", "qualname": "DummyIndex.__init__", "kind": "function", "doc": "

    \n", "signature": "()"}, {"fullname": "debeir.engines.dummyindex.index.DummyIndex.get_documents", "modulename": "debeir.engines.dummyindex.index", "qualname": "DummyIndex.get_documents", "kind": "function", "doc": "

    \n", "signature": "(self):", "funcdef": "def"}, {"fullname": "debeir.engines.dummyindex.index.DummyIndex.query", "modulename": "debeir.engines.dummyindex.index", "qualname": "DummyIndex.query", "kind": "function", "doc": "

    \n", "signature": "(self):", "funcdef": "def"}, {"fullname": "debeir.engines.dummyindex.index.DummyIndex.scorer", "modulename": "debeir.engines.dummyindex.index", "qualname": "DummyIndex.scorer", "kind": "function", "doc": "

    \n", "signature": "(self):", "funcdef": "def"}, {"fullname": "debeir.engines.dummyindex.index.es_isup", "modulename": "debeir.engines.dummyindex.index", "qualname": "es_isup", "kind": "function", "doc": "

    \n", "signature": "(es_client: elasticsearch.AsyncElasticsearch):", "funcdef": "async def"}, {"fullname": "debeir.engines.elasticsearch", "modulename": "debeir.engines.elasticsearch", "kind": "module", "doc": "

    Library code for interacting with the elasticsearch engine

    \n\n

    Contains many helper functions for asynchronous and fast querying, with optional caching available

    \n"}, {"fullname": "debeir.engines.elasticsearch.change_bm25", "modulename": "debeir.engines.elasticsearch.change_bm25", "kind": "module", "doc": "

    \n"}, {"fullname": "debeir.engines.elasticsearch.change_bm25.change_bm25_params", "modulename": "debeir.engines.elasticsearch.change_bm25", "qualname": "change_bm25_params", "kind": "function", "doc": "

    Change the BM25 parameters of the elasticsearch BM25 ranker.

    \n\n
    Parameters
    \n\n
      \n
    • index: The elasticsearch index name
    • \n
• k1: The k parameter for BM25 (default 1.2) [Usually 0-3] [Term saturation constant] ->\nThe higher the k value, the more weight is given to documents that repeat terms.
    • \n
• b: The b parameter for BM25 (default 0.75) [Usually 0-1] [Document length constant] ->\nThe higher the b value, the more it penalises longer documents.
    • \n
    • base_url: The elasticsearch base URL for API requests (without index suffix)
    • \n
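    Example — a sketch lowering term saturation and length penalisation on a local index (the index name and parameter values are illustrative, not tuned):

```python
from debeir.engines.elasticsearch.change_bm25 import change_bm25_params

change_bm25_params(index="msmarco", k1=0.9, b=0.4,
                   base_url="http://localhost:9200")
```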
    \n", "signature": "(index, k1: float, b: float, base_url: str = 'http://localhost:9200'):", "funcdef": "def"}, {"fullname": "debeir.engines.elasticsearch.executor", "modulename": "debeir.engines.elasticsearch.executor", "kind": "module", "doc": "

    \n"}, {"fullname": "debeir.engines.elasticsearch.executor.ElasticsearchExecutor", "modulename": "debeir.engines.elasticsearch.executor", "qualname": "ElasticsearchExecutor", "kind": "class", "doc": "

    Executes an elasticsearch query given the query generated from the config, topics and query class object.

    \n\n

    Computes regular patterns of queries expected from general IR topics and indexes.\nIncludes:\n 1. Reranking\n 2. End-to-End Neural IR\n 3. Statistical keyword matching

    \n"}, {"fullname": "debeir.engines.elasticsearch.executor.ElasticsearchExecutor.__init__", "modulename": "debeir.engines.elasticsearch.executor", "qualname": "ElasticsearchExecutor.__init__", "kind": "function", "doc": "

    \n", "signature": "(\ttopics: Dict[Union[str, int], Dict[str, str]],\tclient: elasticsearch.AsyncElasticsearch,\tindex_name: str,\toutput_file: str,\tquery: debeir.core.query.GenericElasticsearchQuery,\tencoder: Optional[debeir.rankers.transformer_sent_encoder.Encoder],\treturn_size: int = 1000,\ttest=False,\treturn_id_only=True,\tconfig=None)"}, {"fullname": "debeir.engines.elasticsearch.executor.ElasticsearchExecutor.generate_query", "modulename": "debeir.engines.elasticsearch.executor", "qualname": "ElasticsearchExecutor.generate_query", "kind": "function", "doc": "

    Generates a query given a topic number from the list of topics

    \n\n
    Parameters
    \n\n
      \n
    • topic_num:
    • \n
    \n", "signature": "(self, topic_num):", "funcdef": "def"}, {"fullname": "debeir.engines.elasticsearch.executor.ElasticsearchExecutor.execute_query", "modulename": "debeir.engines.elasticsearch.executor", "qualname": "ElasticsearchExecutor.execute_query", "kind": "function", "doc": "

    Execute a query given parameters

    \n\n
    Parameters
    \n\n
      \n
    • args:
    • \n
    • kwargs:
    • \n
    \n", "signature": "(self, *args, **kwargs):", "funcdef": "def"}, {"fullname": "debeir.engines.elasticsearch.executor.ElasticsearchExecutor.run_all_queries", "modulename": "debeir.engines.elasticsearch.executor", "qualname": "ElasticsearchExecutor.run_all_queries", "kind": "function", "doc": "

    A generic function that will asynchronously run all topics using the execute_query() method

    \n\n
    Parameters
    \n\n
      \n
• query_type: Which query to execute. query_type determines which method is used to generate the queries\nfrom self.query.query_funcs: Dict[str, func]
    • \n
    • return_results: Whether to return raw results from the client. Useful for analysing results directly or\nfor computing the BM25 scores for log normalization in NIR-style scoring
    • \n
• return_size: Number of documents to return. Overrides the config value if it exists.
    • \n
    • return_id_only: Return the ID of the document only, rather than the full source document.
    • \n
    • args: Arguments to pass to the execute_query method
    • \n
    • kwargs: Keyword arguments to pass to the execute_query method
    • \n
    \n\n
    Returns
    \n\n
    \n
A list of results if return_results=True, otherwise an empty list.\n
    \n
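    Example — a sketch reusing an executor built as in the earlier examples; the "embedding" key into self.query.query_funcs is an assumption, not confirmed:

```python
import asyncio

results = asyncio.run(
    executor.run_all_queries(
        query_type="embedding",
        return_results=True,
        return_size=1000,
        return_id_only=True,
    )
)
```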
    \n", "signature": "(\tself,\tquery_type=None,\treturn_results=False,\treturn_size: int = None,\treturn_id_only: bool = False,\t**kwargs) -> List:", "funcdef": "async def"}, {"fullname": "debeir.engines.elasticsearch.generate_script_score", "modulename": "debeir.engines.elasticsearch.generate_script_score", "kind": "module", "doc": "

    \n"}, {"fullname": "debeir.engines.elasticsearch.generate_script_score.SourceBuilder", "modulename": "debeir.engines.elasticsearch.generate_script_score", "qualname": "SourceBuilder", "kind": "class", "doc": "

Builds the script score source for NIR-style queries in Elasticsearch.\nUses the Painless language

    \n\n

    This is a string builder class

    \n"}, {"fullname": "debeir.engines.elasticsearch.generate_script_score.SourceBuilder.__init__", "modulename": "debeir.engines.elasticsearch.generate_script_score", "qualname": "SourceBuilder.__init__", "kind": "function", "doc": "

    \n", "signature": "()"}, {"fullname": "debeir.engines.elasticsearch.generate_script_score.SourceBuilder.add_preamble", "modulename": "debeir.engines.elasticsearch.generate_script_score", "qualname": "SourceBuilder.add_preamble", "kind": "function", "doc": "

Adds the preamble to the internal string.\nThe generated script will return the BM25 score if the normalization constant is below 0

    \n", "signature": "(self):", "funcdef": "def"}, {"fullname": "debeir.engines.elasticsearch.generate_script_score.SourceBuilder.add_log_score", "modulename": "debeir.engines.elasticsearch.generate_script_score", "qualname": "SourceBuilder.add_log_score", "kind": "function", "doc": "

    Adds the BM25 log score line

    \n\n
    Parameters
    \n\n
      \n
• ignore_below_one: Ignore all scores below 1.0, as log(1) = 0. Otherwise, only ignore scores of 0 and below.
    • \n
    \n\n
    Returns
    \n\n
    \n
    SourceBuilder\n
    \n
    \n", "signature": "(\tself,\tignore_below_one=False) -> debeir.engines.elasticsearch.generate_script_score.SourceBuilder:", "funcdef": "def"}, {"fullname": "debeir.engines.elasticsearch.generate_script_score.SourceBuilder.add_embed_field", "modulename": "debeir.engines.elasticsearch.generate_script_score", "qualname": "SourceBuilder.add_embed_field", "kind": "function", "doc": "

    Adds a cosine score line.

    \n\n
    Parameters
    \n\n
      \n
    • qfield: Query field
    • \n
    • field: Document facet field
    • \n
    \n\n
    Returns
    \n", "signature": "(\tself,\tqfield,\tfield) -> debeir.engines.elasticsearch.generate_script_score.SourceBuilder:", "funcdef": "def"}, {"fullname": "debeir.engines.elasticsearch.generate_script_score.SourceBuilder.finish", "modulename": "debeir.engines.elasticsearch.generate_script_score", "qualname": "SourceBuilder.finish", "kind": "function", "doc": "

    Finalises the script score and returns the internal string

    \n\n
    Returns
    \n\n
    \n
    A string containing the script score query\n
    \n
    \n", "signature": "(self):", "funcdef": "def"}, {"fullname": "debeir.engines.elasticsearch.generate_script_score.generate_source", "modulename": "debeir.engines.elasticsearch.generate_script_score", "qualname": "generate_source", "kind": "function", "doc": "

Generates the script source based on a set of input fields and facets

    \n\n
    Parameters
    \n\n
      \n
    • qfields: Query fields (or topic fields)
    • \n
    • fields: Document facets to compute cosine similarity on
    • \n
    \n\n
    Returns
    \n", "signature": "(qfields: Union[list, str], fields) -> str:", "funcdef": "def"}, {"fullname": "debeir.engines.elasticsearch.generate_script_score.check_params_is_valid", "modulename": "debeir.engines.elasticsearch.generate_script_score", "qualname": "check_params_is_valid", "kind": "function", "doc": "

Validates that the parameters for the script score pass a simple sanity check.

    \n\n
    Parameters
    \n\n
      \n
    • params:
    • \n
    • qfields:
    • \n
    \n", "signature": "(params, qfields):", "funcdef": "def"}, {"fullname": "debeir.engines.elasticsearch.generate_script_score.generate_script", "modulename": "debeir.engines.elasticsearch.generate_script_score", "qualname": "generate_script", "kind": "function", "doc": "

Generates the full script object from the given parameters

    \n\n
    Parameters
    \n\n
      \n
    • fields: Document fields to search
    • \n
    • params: Parameters for the script
    • \n
    • source_generator: Function that will generate the script
    • \n
    • qfields: Query fields to search from (topic facets)
    • \n
    \n\n
    Returns
    \n", "signature": "(\tfields,\tparams,\tsource_generator=<function generate_source>,\tqfields='q_eb') -> Dict:", "funcdef": "def"}, {"fullname": "debeir.engines.solr", "modulename": "debeir.engines.solr", "kind": "module", "doc": "

    \n"}, {"fullname": "debeir.evaluation", "modulename": "debeir.evaluation", "kind": "module", "doc": "

    Evaluation for retrieved results.

    \n\n

Works for TREC-style queries or for out-of-the-box returned results from the implemented search engines.

    \n"}, {"fullname": "debeir.evaluation.cross_validation", "modulename": "debeir.evaluation.cross_validation", "kind": "module", "doc": "

    \n"}, {"fullname": "debeir.evaluation.cross_validation.split_k_fold", "modulename": "debeir.evaluation.cross_validation", "qualname": "split_k_fold", "kind": "function", "doc": "

    \n", "signature": "(n_fold, data_files):", "funcdef": "def"}, {"fullname": "debeir.evaluation.cross_validation.CrossValidatorTypes", "modulename": "debeir.evaluation.cross_validation", "qualname": "CrossValidatorTypes", "kind": "class", "doc": "

Cross-validator strategies for splitting the dataset

    \n", "bases": "enum.Enum"}, {"fullname": "debeir.evaluation.cross_validation.CrossValidatorTypes.Stratified", "modulename": "debeir.evaluation.cross_validation", "qualname": "CrossValidatorTypes.Stratified", "kind": "variable", "doc": "

    \n", "default_value": " = <CrossValidatorTypes.Stratified: 'StratifiedKFold'>"}, {"fullname": "debeir.evaluation.cross_validation.CrossValidatorTypes.KFold", "modulename": "debeir.evaluation.cross_validation", "qualname": "CrossValidatorTypes.KFold", "kind": "variable", "doc": "

    \n", "default_value": " = <CrossValidatorTypes.KFold: 'KFold'>"}, {"fullname": "debeir.evaluation.cross_validation.CrossValidator", "modulename": "debeir.evaluation.cross_validation", "qualname": "CrossValidator", "kind": "class", "doc": "

Cross-validator class for different types of datasets

    \n\n

    E.g. List -> [[Data], label]\n List[Dict] -> {\"data\": Data, \"label\": label}\n Huggingface Dataset Object -> Data(set=\"train\", label = \"label\").select(idx)

    \n"}, {"fullname": "debeir.evaluation.cross_validation.CrossValidator.__init__", "modulename": "debeir.evaluation.cross_validation", "qualname": "CrossValidator.__init__", "kind": "function", "doc": "

    \n", "signature": "(\tdataset: Union[List, List[Dict], datasets.arrow_dataset.Dataset],\tx_idx_label_or_attr: Union[str, int],\ty_idx_label_or_attr: Union[str, int],\tcross_validator_type: [<class 'str'>, <enum 'CrossValidatorTypes'>] = <CrossValidatorTypes.Stratified: 'StratifiedKFold'>,\tseed=42,\tn_splits=5)"}, {"fullname": "debeir.evaluation.cross_validation.CrossValidator.get_fold", "modulename": "debeir.evaluation.cross_validation", "qualname": "CrossValidator.get_fold", "kind": "function", "doc": "
    Parameters
    \n\n
      \n
    • fold_num: Which fold to pick
    • \n
    \n\n
    Returns
    \n", "signature": "(self, fold_num: int):", "funcdef": "def"}, {"fullname": "debeir.evaluation.evaluator", "modulename": "debeir.evaluation.evaluator", "kind": "module", "doc": "

    \n"}, {"fullname": "debeir.evaluation.evaluator.Evaluator", "modulename": "debeir.evaluation.evaluator", "qualname": "Evaluator", "kind": "class", "doc": "

    Evaluation class for computing metrics from TREC-style files

    \n"}, {"fullname": "debeir.evaluation.evaluator.Evaluator.__init__", "modulename": "debeir.evaluation.evaluator", "qualname": "Evaluator.__init__", "kind": "function", "doc": "

    \n", "signature": "(qrels: str, metrics: List[str])"}, {"fullname": "debeir.evaluation.evaluator.Evaluator.evaluate_runs", "modulename": "debeir.evaluation.evaluator", "qualname": "Evaluator.evaluate_runs", "kind": "function", "doc": "

    Evaluates the TREC-style results from an input result list or file

    \n\n
    Parameters
    \n\n
      \n
    • res: Results file path or raw results list
    • \n
    • kwargs: Keyword arguments to pass to the underlying analysis_tools_ir.parse_run library
    • \n
    \n\n
    Returns
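    Example — a minimal sketch; the qrels path, run file and metric names are illustrative:

```python
from debeir.evaluation.evaluator import Evaluator

evaluator = Evaluator(qrels="qrels.txt", metrics=["NDCG@10", "P@10"])
runs = evaluator.evaluate_runs("run.txt")  # a TREC-style run file
```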
    \n", "signature": "(self, res: Union[str, List[str]], **kwargs):", "funcdef": "def"}, {"fullname": "debeir.evaluation.evaluator.Evaluator.average_all_metrics", "modulename": "debeir.evaluation.evaluator", "qualname": "Evaluator.average_all_metrics", "kind": "function", "doc": "

Averages the per-topic metric scores into a single averaged score.

    \n\n
    Parameters
    \n\n
      \n
    • runs: Parsed run dictionary: {metric_name@depth: Run object}
    • \n
    • logger: Logger to print metrics
    • \n
    \n", "signature": "(\tself,\truns: Dict,\tlogger: <loguru.logger handlers=[(id=0, level=10, sink=<_io.StringIO object at 0x105cfa710>)]>):", "funcdef": "def"}, {"fullname": "debeir.evaluation.evaluator.Evaluator.sigtests", "modulename": "debeir.evaluation.evaluator", "qualname": "Evaluator.sigtests", "kind": "function", "doc": "

    Run a paired significance test on two result files

    \n\n
    Parameters
    \n\n
      \n
    • results_a:
    • \n
    • results_b:
    • \n
    \n\n
    Returns
    \n", "signature": "(self, results_a, results_b):", "funcdef": "def"}, {"fullname": "debeir.evaluation.evaluator.Evaluator.build_from_config", "modulename": "debeir.evaluation.evaluator", "qualname": "Evaluator.build_from_config", "kind": "function", "doc": "

    \n", "signature": "(\tcls,\tconfig: debeir.core.config.GenericConfig,\tmetrics_config: debeir.core.config.MetricsConfig):", "funcdef": "def"}, {"fullname": "debeir.evaluation.residual_scoring", "modulename": "debeir.evaluation.residual_scoring", "kind": "module", "doc": "

    \n"}, {"fullname": "debeir.evaluation.residual_scoring.ResidualEvaluator", "modulename": "debeir.evaluation.residual_scoring", "qualname": "ResidualEvaluator", "kind": "class", "doc": "

Residual scoring is the scoring of a subset of documents, the residual, which is created by removing documents from the collection and qrels.

    \n", "bases": "debeir.evaluation.evaluator.Evaluator"}, {"fullname": "debeir.evaluation.residual_scoring.ResidualEvaluator.__init__", "modulename": "debeir.evaluation.residual_scoring", "qualname": "ResidualEvaluator.__init__", "kind": "function", "doc": "

    Args:\n qrels (str): Path to qrels \n metrics (List[str]): A list of metrics with depth e.g. NDCG@1000\n filter_ids (Dict[str, List[str]]): A list of IDs to remove from the collection given by Dict[Topic_num, [Docids]]
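    Example — a sketch removing two (illustrative) document IDs from topic "1" before scoring; paths and metric names are also illustrative:

```python
from debeir.evaluation.residual_scoring import ResidualEvaluator

evaluator = ResidualEvaluator(
    qrels="qrels.txt",
    metrics=["NDCG@1000"],
    filter_ids={"1": ["NCT00000102", "NCT00000104"]},
)
metrics = evaluator.evaluate_runs("run.txt")
```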

    \n", "signature": "(qrels: str, metrics: List[str], filter_ids: Dict[str, List[str]])"}, {"fullname": "debeir.evaluation.residual_scoring.ResidualEvaluator.evaluate_runs", "modulename": "debeir.evaluation.residual_scoring", "qualname": "ResidualEvaluator.evaluate_runs", "kind": "function", "doc": "

    Run the residual evaluation for the runs

    \n\n
    Parameters
    \n\n
      \n
    • res: The results to run the evaluator against
    • \n
    • with_trec_binary: Use the TREC C binary instead of the default Python library, defaults to False
    • \n
    \n\n
    Returns
    \n\n
    \n

A dictionary of the supplied metrics computed from the results against the qrels

    \n
    \n", "signature": "(self, res: Union[str, List[str]], with_trec_binary=False, **kwargs):", "funcdef": "def"}, {"fullname": "debeir.models", "modulename": "debeir.models", "kind": "module", "doc": "

    \n"}, {"fullname": "debeir.models.colbert", "modulename": "debeir.models.colbert", "kind": "module", "doc": "

    \n"}, {"fullname": "debeir.models.colbert.CoLBERTConfig", "modulename": "debeir.models.colbert", "qualname": "CoLBERTConfig", "kind": "class", "doc": "

    \n"}, {"fullname": "debeir.models.colbert.CoLBERTConfig.__init__", "modulename": "debeir.models.colbert", "qualname": "CoLBERTConfig.__init__", "kind": "function", "doc": "

    \n", "signature": "(**kwargs)"}, {"fullname": "debeir.models.colbert.CoLBERTConfig.save", "modulename": "debeir.models.colbert", "qualname": "CoLBERTConfig.save", "kind": "function", "doc": "
    Parameters
    \n\n
      \n
    • fname: file name
    • \n
    • path: Path to save
    • \n
    \n", "signature": "(self, path, fname='colbert_config.json'):", "funcdef": "def"}, {"fullname": "debeir.models.colbert.CoLBERTConfig.load", "modulename": "debeir.models.colbert", "qualname": "CoLBERTConfig.load", "kind": "function", "doc": "

Loads the ColBERT config from path (point to the directory, not the file name)

    \n\n
    Returns
    \n", "signature": "(cls, path, fname='colbert_config.json'):", "funcdef": "def"}, {"fullname": "debeir.models.colbert.ConvolutionalBlock", "modulename": "debeir.models.colbert", "qualname": "ConvolutionalBlock", "kind": "class", "doc": "

    Base class for all neural network modules.

    \n\n

    Your models should also subclass this class.

    \n\n

    Modules can also contain other Modules, allowing to nest them in\na tree structure. You can assign the submodules as regular attributes::

    \n\n
    import torch.nn as nn\nimport torch.nn.functional as F\n\nclass Model(nn.Module):\n    def __init__(self):\n        super().__init__()\n        self.conv1 = nn.Conv2d(1, 20, 5)\n        self.conv2 = nn.Conv2d(20, 20, 5)\n\n    def forward(self, x):\n        x = F.relu(self.conv1(x))\n        return F.relu(self.conv2(x))\n
    \n\n

    Submodules assigned in this way will be registered, and will have their\nparameters converted too when you call to(), etc.

    \n\n
    \n\n

    As per the example above, an __init__() call to the parent class\nmust be made before assignment on the child.

    \n\n
    \n\n

    :ivar training: Boolean represents whether this module is in training or\n evaluation mode.\n:vartype training: bool

    \n", "bases": "torch.nn.modules.module.Module"}, {"fullname": "debeir.models.colbert.ConvolutionalBlock.__init__", "modulename": "debeir.models.colbert", "qualname": "ConvolutionalBlock.__init__", "kind": "function", "doc": "

    Initializes internal Module state, shared by both nn.Module and ScriptModule.

    \n", "signature": "(\tin_channels,\tout_channels,\tkernel_size=1,\tfirst_stride=1,\tact_func=<class 'torch.nn.modules.activation.ReLU'>)"}, {"fullname": "debeir.models.colbert.ConvolutionalBlock.forward", "modulename": "debeir.models.colbert", "qualname": "ConvolutionalBlock.forward", "kind": "function", "doc": "

    Defines the computation performed at every call.

    \n\n

    Should be overridden by all subclasses.

    \n\n
    \n\n

    Although the recipe for forward pass needs to be defined within\nthis function, one should call the Module instance afterwards\ninstead of this since the former takes care of running the\nregistered hooks while the latter silently ignores them.

    \n\n
    \n", "signature": "(self, x):", "funcdef": "def"}, {"fullname": "debeir.models.colbert.KMaxPool", "modulename": "debeir.models.colbert", "qualname": "KMaxPool", "kind": "class", "doc": "

    Base class for all neural network modules.

    \n\n

    Your models should also subclass this class.

    \n\n

    Modules can also contain other Modules, allowing to nest them in\na tree structure. You can assign the submodules as regular attributes::

    \n\n
    import torch.nn as nn\nimport torch.nn.functional as F\n\nclass Model(nn.Module):\n    def __init__(self):\n        super().__init__()\n        self.conv1 = nn.Conv2d(1, 20, 5)\n        self.conv2 = nn.Conv2d(20, 20, 5)\n\n    def forward(self, x):\n        x = F.relu(self.conv1(x))\n        return F.relu(self.conv2(x))\n
    \n\n

    Submodules assigned in this way will be registered, and will have their\nparameters converted too when you call to(), etc.

    \n\n
    \n\n

    As per the example above, an __init__() call to the parent class\nmust be made before assignment on the child.

    \n\n
    \n\n

    :ivar training: Boolean represents whether this module is in training or\n evaluation mode.\n:vartype training: bool

    \n", "bases": "torch.nn.modules.module.Module"}, {"fullname": "debeir.models.colbert.KMaxPool.__init__", "modulename": "debeir.models.colbert", "qualname": "KMaxPool.__init__", "kind": "function", "doc": "

    Initializes internal Module state, shared by both nn.Module and ScriptModule.

    \n", "signature": "(k=1)"}, {"fullname": "debeir.models.colbert.KMaxPool.forward", "modulename": "debeir.models.colbert", "qualname": "KMaxPool.forward", "kind": "function", "doc": "

    Defines the computation performed at every call.

    \n\n

    Should be overridden by all subclasses.

    \n\n
    \n\n

    Although the recipe for forward pass needs to be defined within\nthis function, one should call the Module instance afterwards\ninstead of this since the former takes care of running the\nregistered hooks while the latter silently ignores them.

    \n\n
    \n", "signature": "(self, x):", "funcdef": "def"}, {"fullname": "debeir.models.colbert.visualisation_dump", "modulename": "debeir.models.colbert", "qualname": "visualisation_dump", "kind": "function", "doc": "

    \n", "signature": "(argmax, input_tensors):", "funcdef": "def"}, {"fullname": "debeir.models.colbert.ResidualBlock", "modulename": "debeir.models.colbert", "qualname": "ResidualBlock", "kind": "class", "doc": "

    Base class for all neural network modules.

    \n\n

    Your models should also subclass this class.

    \n\n

    Modules can also contain other Modules, allowing to nest them in\na tree structure. You can assign the submodules as regular attributes::

    \n\n
    import torch.nn as nn\nimport torch.nn.functional as F\n\nclass Model(nn.Module):\n    def __init__(self):\n        super().__init__()\n        self.conv1 = nn.Conv2d(1, 20, 5)\n        self.conv2 = nn.Conv2d(20, 20, 5)\n\n    def forward(self, x):\n        x = F.relu(self.conv1(x))\n        return F.relu(self.conv2(x))\n
    \n\n

    Submodules assigned in this way will be registered, and will have their\nparameters converted too when you call to(), etc.

    \n\n
    \n\n

    As per the example above, an __init__() call to the parent class\nmust be made before assignment on the child.

    \n\n
    \n\n

    :ivar training: Boolean represents whether this module is in training or\n evaluation mode.\n:vartype training: bool

    \n", "bases": "torch.nn.modules.module.Module"}, {"fullname": "debeir.models.colbert.ResidualBlock.__init__", "modulename": "debeir.models.colbert", "qualname": "ResidualBlock.__init__", "kind": "function", "doc": "

    Initializes internal Module state, shared by both nn.Module and ScriptModule.

    \n", "signature": "(\tin_channels,\tout_channels,\toptional_shortcut=True,\tkernel_size=1,\tact_func=<class 'torch.nn.modules.activation.ReLU'>)"}, {"fullname": "debeir.models.colbert.ResidualBlock.forward", "modulename": "debeir.models.colbert", "qualname": "ResidualBlock.forward", "kind": "function", "doc": "

    Defines the computation performed at every call.

    \n\n

    Should be overridden by all subclasses.

    \n\n
    \n\n

    Although the recipe for forward pass needs to be defined within\nthis function, one should call the Module instance afterwards\ninstead of this since the former takes care of running the\nregistered hooks while the latter silently ignores them.

    \n\n
    \n", "signature": "(self, x):", "funcdef": "def"}, {"fullname": "debeir.models.colbert.ColBERT", "modulename": "debeir.models.colbert", "qualname": "ColBERT", "kind": "class", "doc": "

    Base class for all neural network modules.

    \n\n

    Your models should also subclass this class.

    \n\n

    Modules can also contain other Modules, allowing to nest them in\na tree structure. You can assign the submodules as regular attributes::

    \n\n
    import torch.nn as nn\nimport torch.nn.functional as F\n\nclass Model(nn.Module):\n    def __init__(self):\n        super().__init__()\n        self.conv1 = nn.Conv2d(1, 20, 5)\n        self.conv2 = nn.Conv2d(20, 20, 5)\n\n    def forward(self, x):\n        x = F.relu(self.conv1(x))\n        return F.relu(self.conv2(x))\n
    \n\n

    Submodules assigned in this way will be registered, and will have their\nparameters converted too when you call to(), etc.

    \n\n
    \n\n

    As per the example above, an __init__() call to the parent class\nmust be made before assignment on the child.

    \n\n
    \n\n

    :ivar training: Boolean represents whether this module is in training or\n evaluation mode.\n:vartype training: bool

    \n", "bases": "torch.nn.modules.module.Module"}, {"fullname": "debeir.models.colbert.ColBERT.__init__", "modulename": "debeir.models.colbert", "qualname": "ColBERT.__init__", "kind": "function", "doc": "

    Initializes internal Module state, shared by both nn.Module and ScriptModule.

    \n", "signature": "(\tbert_model_args,\tbert_model_kwargs,\tconfig: transformers.models.bert.configuration_bert.BertConfig,\tdevice: str,\tmax_seq_len: int = 128,\tk: int = 8,\toptional_shortcut: bool = True,\thidden_neurons: int = 2048,\tuse_batch_norms: bool = True,\tuse_trans_blocks: bool = False,\tresidual_kernel_size: int = 1,\tdropout_perc: float = 0.5,\tact_func='mish',\tloss_func='cross_entropy_loss',\t**kwargs)"}, {"fullname": "debeir.models.colbert.ColBERT.forward", "modulename": "debeir.models.colbert", "qualname": "ColBERT.forward", "kind": "function", "doc": "

    Defines the computation performed at every call.

    \n\n

    Should be overridden by all subclasses.

    \n\n
    \n\n

    Although the recipe for forward pass needs to be defined within\nthis function, one should call the Module instance afterwards\ninstead of this since the former takes care of running the\nregistered hooks while the latter silently ignores them.

    \n\n
    \n", "signature": "(self, *args, **kwargs):", "funcdef": "def"}, {"fullname": "debeir.models.colbert.ColBERT.from_config", "modulename": "debeir.models.colbert", "qualname": "ColBERT.from_config", "kind": "function", "doc": "

    \n", "signature": "(cls, *args, config_path):", "funcdef": "def"}, {"fullname": "debeir.models.colbert.ColBERT.from_pretrained", "modulename": "debeir.models.colbert", "qualname": "ColBERT.from_pretrained", "kind": "function", "doc": "

    \n", "signature": "(cls, output_dir, **kwargs):", "funcdef": "def"}, {"fullname": "debeir.models.colbert.ColBERT.save_pretrained", "modulename": "debeir.models.colbert", "qualname": "ColBERT.save_pretrained", "kind": "function", "doc": "

    \n", "signature": "(self, output_dir):", "funcdef": "def"}, {"fullname": "debeir.models.colbert.ComBERT", "modulename": "debeir.models.colbert", "qualname": "ComBERT", "kind": "class", "doc": "

    Base class for all neural network modules.

    \n\n

    Your models should also subclass this class.

    \n\n

    Modules can also contain other Modules, allowing to nest them in\na tree structure. You can assign the submodules as regular attributes::

    \n\n
    import torch.nn as nn\nimport torch.nn.functional as F\n\nclass Model(nn.Module):\n    def __init__(self):\n        super().__init__()\n        self.conv1 = nn.Conv2d(1, 20, 5)\n        self.conv2 = nn.Conv2d(20, 20, 5)\n\n    def forward(self, x):\n        x = F.relu(self.conv1(x))\n        return F.relu(self.conv2(x))\n
    \n\n

    Submodules assigned in this way will be registered, and will have their\nparameters converted too when you call to(), etc.

    \n\n
    \n\n

    As per the example above, an __init__() call to the parent class\nmust be made before assignment on the child.

    \n\n
    \n\n

    :ivar training: Boolean represents whether this module is in training or\n evaluation mode.\n:vartype training: bool

    \n", "bases": "torch.nn.modules.module.Module"}, {"fullname": "debeir.models.colbert.ComBERT.__init__", "modulename": "debeir.models.colbert", "qualname": "ComBERT.__init__", "kind": "function", "doc": "

    Initializes internal Module state, shared by both nn.Module and ScriptModule.

    \n", "signature": "(\tbert_model_args,\tbert_model_kwargs,\tconfig: transformers.models.bert.configuration_bert.BertConfig,\tdevice: str,\tmax_seq_len: int = 128,\tk: int = 8,\toptional_shortcut: bool = True,\thidden_neurons: int = 2048,\tuse_batch_norms: bool = True,\tuse_trans_blocks: bool = False,\tresidual_kernel_size: int = 1,\tdropout_perc: float = 0.5,\tact_func='mish',\tloss_func='cross_entropy_loss',\tnum_blocks=2,\t**kwargs)"}, {"fullname": "debeir.models.colbert.ComBERT.forward", "modulename": "debeir.models.colbert", "qualname": "ComBERT.forward", "kind": "function", "doc": "

    Defines the computation performed at every call.

    \n\n

    Should be overridden by all subclasses.

    \n\n
    \n\n

    Although the recipe for forward pass needs to be defined within\nthis function, one should call the Module instance afterwards\ninstead of this since the former takes care of running the\nregistered hooks while the latter silently ignores them.

    \n\n
    \n", "signature": "(self, *args, **kwargs):", "funcdef": "def"}, {"fullname": "debeir.models.colbert.ComBERT.from_config", "modulename": "debeir.models.colbert", "qualname": "ComBERT.from_config", "kind": "function", "doc": "

    \n", "signature": "(cls, *args, config_path):", "funcdef": "def"}, {"fullname": "debeir.models.colbert.ComBERT.from_pretrained", "modulename": "debeir.models.colbert", "qualname": "ComBERT.from_pretrained", "kind": "function", "doc": "

    \n", "signature": "(cls, output_dir, **kwargs):", "funcdef": "def"}, {"fullname": "debeir.models.colbert.ComBERT.save_pretrained", "modulename": "debeir.models.colbert", "qualname": "ComBERT.save_pretrained", "kind": "function", "doc": "

    \n", "signature": "(self, output_dir):", "funcdef": "def"}, {"fullname": "debeir.rankers", "modulename": "debeir.rankers", "kind": "module", "doc": "

    Rankers module.

    \n\n
    \n

Includes runnable out-of-the-box training code,\n custom ranking loss functions (e.g. LambdaLoss, NDCGLoss),\n and custom rankers for reranking or NIR-style queries.

    \n
    \n"}, {"fullname": "debeir.rankers.reranking", "modulename": "debeir.rankers.reranking", "kind": "module", "doc": "

    \n"}, {"fullname": "debeir.rankers.reranking.nir", "modulename": "debeir.rankers.reranking.nir", "kind": "module", "doc": "

    NIR Reranker

    \n\n

    [Insert paper link here]

    \n"}, {"fullname": "debeir.rankers.reranking.nir.NIReRanker", "modulename": "debeir.rankers.reranking.nir", "qualname": "NIReRanker", "kind": "class", "doc": "

    Re-ranker which uses the NIR scoring method\n score = log(bm25)/log(z) + cosine_sum
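    A worked example of this formula (all input numbers illustrative):

```python
import math

bm25, z, cosine_sum = 12.0, 2.15, 1.3
score = math.log(bm25) / math.log(z) + cosine_sum  # ~= 4.55
```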

    \n", "bases": "debeir.rankers.reranking.reranker.DocumentReRanker"}, {"fullname": "debeir.rankers.reranking.nir.NIReRanker.__init__", "modulename": "debeir.rankers.reranking.nir", "qualname": "NIReRanker.__init__", "kind": "function", "doc": "

    \n", "signature": "(\tquery,\tranked_list: List[debeir.core.document.Document],\tencoder: debeir.rankers.transformer_sent_encoder.Encoder,\tdistance_fn=<function cosine>,\tfacets_weights: Dict = None,\tpresort=False,\tfields_to_encode=None,\t*args,\t**kwargs)"}, {"fullname": "debeir.rankers.reranking.reranker", "modulename": "debeir.rankers.reranking.reranker", "kind": "module", "doc": "

    General re-ranking interfaces to be implemented by child classes.

    \n"}, {"fullname": "debeir.rankers.reranking.reranker.ReRanker", "modulename": "debeir.rankers.reranking.reranker", "qualname": "ReRanker", "kind": "class", "doc": "

General interface for reranking.

    \n\n

    Child classes should implement the abstract methods.

    \n"}, {"fullname": "debeir.rankers.reranking.reranker.ReRanker.__init__", "modulename": "debeir.rankers.reranking.reranker", "qualname": "ReRanker.__init__", "kind": "function", "doc": "

    \n", "signature": "(query, ranked_list: List, *args, **kwargs)"}, {"fullname": "debeir.rankers.reranking.reranker.ReRanker.rerank", "modulename": "debeir.rankers.reranking.reranker", "qualname": "ReRanker.rerank", "kind": "function", "doc": "

Re-ranks the passed ranked list based on the implemented private _compute_scores method.

    \n\n
    Parameters
    \n\n
      \n
    • ranked_list:
    • \n
    \n\n
    Returns
    \n\n
    \n
    A ranked list in descending order of the score field (which will be the last item in the list)\n
    \n
    \n", "signature": "(self) -> List:", "funcdef": "def"}, {"fullname": "debeir.rankers.reranking.reranker.DocumentReRanker", "modulename": "debeir.rankers.reranking.reranker", "qualname": "DocumentReRanker", "kind": "class", "doc": "

    Reranking interface for a ranked list of Document objects.

    \n", "bases": "ReRanker"}, {"fullname": "debeir.rankers.reranking.reranker.DocumentReRanker.__init__", "modulename": "debeir.rankers.reranking.reranker", "qualname": "DocumentReRanker.__init__", "kind": "function", "doc": "

    \n", "signature": "(\tquery,\tranked_list: List[debeir.core.document.Document],\t*args,\t**kwargs)"}, {"fullname": "debeir.rankers.reranking.reranker.ReRankerPool", "modulename": "debeir.rankers.reranking.reranker", "qualname": "ReRankerPool", "kind": "class", "doc": "

    \n"}, {"fullname": "debeir.rankers.reranking.reranker.ReRankerPool.__init__", "modulename": "debeir.rankers.reranking.reranker", "qualname": "ReRankerPool.__init__", "kind": "function", "doc": "

    \n", "signature": "()"}, {"fullname": "debeir.rankers.reranking.use", "modulename": "debeir.rankers.reranking.use", "kind": "module", "doc": "

    \n"}, {"fullname": "debeir.rankers.reranking.use.USEReRanker", "modulename": "debeir.rankers.reranking.use", "qualname": "USEReRanker", "kind": "class", "doc": "

Re-ranks based on the cosine_sum rather than the complete NIR scoring

    \n", "bases": "debeir.rankers.reranking.nir.NIReRanker"}, {"fullname": "debeir.rankers.reranking.use.USEReRanker.__init__", "modulename": "debeir.rankers.reranking.use", "qualname": "USEReRanker.__init__", "kind": "function", "doc": "

    \n", "signature": "(*args, **kwargs)"}, {"fullname": "debeir.rankers.transformer_sent_encoder", "modulename": "debeir.rankers.transformer_sent_encoder", "kind": "module", "doc": "

    \n"}, {"fullname": "debeir.rankers.transformer_sent_encoder.Encoder", "modulename": "debeir.rankers.transformer_sent_encoder", "qualname": "Encoder", "kind": "class", "doc": "

    A wrapper for the Sentence Transformer Encoder used in Universal Sentence Embeddings (USE) for ranking or reranking.

    \n\n
    Parameters
    \n\n
      \n
    • model_path: The path to a sentence transformer or transformer model.
    • \n
    • normalize: Normalize the output vectors to unit length for dot product retrieval rather than cosine.
    • \n
    • spacy_model: the spacy or scispacy model to use for sentence boundary detection.
    • \n
    • max_length: Maximum input length for the spacy nlp model.
    • \n
    \n"}, {"fullname": "debeir.rankers.transformer_sent_encoder.Encoder.__init__", "modulename": "debeir.rankers.transformer_sent_encoder", "qualname": "Encoder.__init__", "kind": "function", "doc": "

    \n", "signature": "(\tmodel_path,\tnormalize=False,\tspacy_model='en_core_sci_md',\tmax_length=2000000)"}, {"fullname": "debeir.rankers.transformer_sent_encoder.Encoder.encode", "modulename": "debeir.rankers.transformer_sent_encoder", "qualname": "Encoder.encode", "kind": "function", "doc": "

Computes sentence embeddings for a given topic, using spacy for sentence segmentation.\nBy default, a cache stores previously computed vectors; pass disable_cache=True as a kwarg to disable this.

    \n\n
    Parameters
    \n\n
      \n
• topic: The topic to encode, passed as a raw string; it is segmented into sentences internally.
    • \n
    • disable_cache: keyword argument, pass as True to disable encoding caching.
    • \n
    \n\n
    Returns
    \n\n
    \n
A list of encoded tensors.\n
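    Example — a minimal sketch; the model path is illustrative, and the default spacy model (en_core_sci_md) must be installed, or another model passed via spacy_model:

```python
from debeir.rankers.transformer_sent_encoder import Encoder

encoder = Encoder("sentence-transformers/all-MiniLM-L6-v2", normalize=True)
vectors = encoder.encode("Diabetes is a metabolic disease. It affects insulin.")
```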
    \n
    \n", "signature": "(self, topic: str) -> List:", "funcdef": "def"}, {"fullname": "debeir.training", "modulename": "debeir.training", "kind": "module", "doc": "

    Runnable out-of-the-box code for training re-rankers.

    \n"}, {"fullname": "debeir.training.evaluate_reranker", "modulename": "debeir.training.evaluate_reranker", "kind": "module", "doc": "

    \n"}, {"fullname": "debeir.training.evaluate_reranker.SentenceEvaluator", "modulename": "debeir.training.evaluate_reranker", "qualname": "SentenceEvaluator", "kind": "class", "doc": "

    Evaluation class for computing metrics from TREC-style files

    \n", "bases": "debeir.evaluation.evaluator.Evaluator"}, {"fullname": "debeir.training.evaluate_reranker.SentenceEvaluator.__init__", "modulename": "debeir.training.evaluate_reranker", "qualname": "SentenceEvaluator.__init__", "kind": "function", "doc": "

    \n", "signature": "(\tmodel: debeir.rankers.transformer_sent_encoder.Encoder,\tdataset: datasets.arrow_dataset.Dataset,\tparsed_topics: Dict[Union[str, int], Dict],\ttext_cols: List[str],\tquery_cols: List[str],\tid_col: str,\tdistance_fn: str,\tqrels: str,\tmetrics: List[str])"}, {"fullname": "debeir.training.evaluate_reranker.SentenceEvaluator.produce_ranked_lists", "modulename": "debeir.training.evaluate_reranker", "qualname": "SentenceEvaluator.produce_ranked_lists", "kind": "function", "doc": "

    \n", "signature": "(self):", "funcdef": "def"}, {"fullname": "debeir.training.hparm_tuning", "modulename": "debeir.training.hparm_tuning", "kind": "module", "doc": "

Hyperparameter tuning library using Optuna and wandb

    \n"}, {"fullname": "debeir.training.hparm_tuning.config", "modulename": "debeir.training.hparm_tuning.config", "kind": "module", "doc": "

    \n"}, {"fullname": "debeir.training.hparm_tuning.config.HparamConfig", "modulename": "debeir.training.hparm_tuning.config", "qualname": "HparamConfig", "kind": "class", "doc": "

    Hyperparameter configuration file

    \n\n

    Expects a dictionary of hyperparameters

    \n\n

    hparams: Dict\n{\n \"learning_rate\": {\n \"type\": float\n \"low\": 0.1\n \"high\": 1.0\n \"step\": 0.1\n # OR\n args: [0.1, 1.0, 0.1]\n },\n}
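    Example — an in-code sketch of the dictionary above (values illustrative; the "type" encoding is an assumption); from_json(fp) can load the same structure from a file:

```python
from debeir.training.hparm_tuning.config import HparamConfig

config = HparamConfig(hparams={
    "learning_rate": {"type": "float", "low": 0.1, "high": 1.0, "step": 0.1},
})
```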

    \n", "bases": "debeir.core.config.Config"}, {"fullname": "debeir.training.hparm_tuning.config.HparamConfig.__init__", "modulename": "debeir.training.hparm_tuning.config", "qualname": "HparamConfig.__init__", "kind": "function", "doc": "

    \n", "signature": "(hparams: Dict[str, Dict])"}, {"fullname": "debeir.training.hparm_tuning.config.HparamConfig.from_json", "modulename": "debeir.training.hparm_tuning.config", "qualname": "HparamConfig.from_json", "kind": "function", "doc": "

    \n", "signature": "(cls, fp) -> debeir.training.hparm_tuning.config.HparamConfig:", "funcdef": "def"}, {"fullname": "debeir.training.hparm_tuning.config.HparamConfig.validate", "modulename": "debeir.training.hparm_tuning.config", "qualname": "HparamConfig.validate", "kind": "function", "doc": "

Validates that the config is correct.\nMust be implemented by inheriting classes.

    \n", "signature": "(self):", "funcdef": "def"}, {"fullname": "debeir.training.hparm_tuning.config.HparamConfig.parse_config_to_py", "modulename": "debeir.training.hparm_tuning.config", "qualname": "HparamConfig.parse_config_to_py", "kind": "function", "doc": "

    Parses configuration file into usable python objects

    \n", "signature": "(self):", "funcdef": "def"}, {"fullname": "debeir.training.hparm_tuning.optuna_rank", "modulename": "debeir.training.hparm_tuning.optuna_rank", "kind": "module", "doc": "

    \n"}, {"fullname": "debeir.training.hparm_tuning.optuna_rank.objective", "modulename": "debeir.training.hparm_tuning.optuna_rank", "qualname": "objective", "kind": "function", "doc": "

    \n", "signature": "(\ttrainer: debeir.training.hparm_tuning.trainer.Trainer,\ttrial: optuna.trial._trial.Trial):", "funcdef": "def"}, {"fullname": "debeir.training.hparm_tuning.optuna_rank.run_optuna_with_wandb", "modulename": "debeir.training.hparm_tuning.optuna_rank", "qualname": "run_optuna_with_wandb", "kind": "function", "doc": "

    Partially initialize the objective function with a trainer and hparams to optimize.

    \n\n

    Optimize using the optuna library.

    \n\n
    Parameters
    \n\n
      \n
    • trainer:
    • \n
    • n_trials:
    • \n
    • maximize_objective:
    • \n
    • wandb_kwargs:
    • \n
    \n\n
    Returns
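    Example — a minimal sketch; the trainer is built elsewhere, and the Study return value is an assumption inferred from print_optuna_stats' signature:

```python
from debeir.training.hparm_tuning.optuna_rank import (
    print_optuna_stats, run_optuna_with_wandb)

trainer = ...  # a SentenceTransformerHparamTrainer built elsewhere
study = run_optuna_with_wandb(trainer, n_trials=20, maximize_objective=True)
print_optuna_stats(study)
```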
    \n", "signature": "(\ttrainer,\tn_trials=100,\tn_jobs=1,\tmaximize_objective=True,\tsave_study_path='.',\twandb_kwargs=None):", "funcdef": "def"}, {"fullname": "debeir.training.hparm_tuning.optuna_rank.print_optuna_stats", "modulename": "debeir.training.hparm_tuning.optuna_rank", "qualname": "print_optuna_stats", "kind": "function", "doc": "

    \n", "signature": "(study: optuna.study.study.Study):", "funcdef": "def"}, {"fullname": "debeir.training.hparm_tuning.trainer", "modulename": "debeir.training.hparm_tuning.trainer", "kind": "module", "doc": "

    \n"}, {"fullname": "debeir.training.hparm_tuning.trainer.OptimizersWrapper", "modulename": "debeir.training.hparm_tuning.trainer", "qualname": "OptimizersWrapper", "kind": "class", "doc": "

    \n"}, {"fullname": "debeir.training.hparm_tuning.trainer.OptimizersWrapper.__init__", "modulename": "debeir.training.hparm_tuning.trainer", "qualname": "OptimizersWrapper.__init__", "kind": "function", "doc": "

    \n", "signature": "()"}, {"fullname": "debeir.training.hparm_tuning.trainer.Trainer", "modulename": "debeir.training.hparm_tuning.trainer", "qualname": "Trainer", "kind": "class", "doc": "

    Wrapper class for a trainer class.

    \n"}, {"fullname": "debeir.training.hparm_tuning.trainer.Trainer.__init__", "modulename": "debeir.training.hparm_tuning.trainer", "qualname": "Trainer.__init__", "kind": "function", "doc": "

    \n", "signature": "(model, evaluator_fn, dataset_loading_fn)"}, {"fullname": "debeir.training.hparm_tuning.trainer.Trainer.fit", "modulename": "debeir.training.hparm_tuning.trainer", "qualname": "Trainer.fit", "kind": "function", "doc": "

    \n", "signature": "(\tself,\tin_trial: optuna.trial._trial.Trial,\ttrain_dataset,\tval_dataset):", "funcdef": "def"}, {"fullname": "debeir.training.hparm_tuning.trainer.SentenceTransformerHparamTrainer", "modulename": "debeir.training.hparm_tuning.trainer", "qualname": "SentenceTransformerHparamTrainer", "kind": "class", "doc": "

    See Optuna documentation for types!

    \n", "bases": "Trainer"}, {"fullname": "debeir.training.hparm_tuning.trainer.SentenceTransformerHparamTrainer.__init__", "modulename": "debeir.training.hparm_tuning.trainer", "qualname": "SentenceTransformerHparamTrainer.__init__", "kind": "function", "doc": "

    \n", "signature": "(\tdataset_loading_fn,\tevaluator_fn,\thparams_config: debeir.training.hparm_tuning.config.HparamConfig)"}, {"fullname": "debeir.training.hparm_tuning.trainer.SentenceTransformerHparamTrainer.get_optuna_hparams", "modulename": "debeir.training.hparm_tuning.trainer", "qualname": "SentenceTransformerHparamTrainer.get_optuna_hparams", "kind": "function", "doc": "

    Get hyperparameters suggested by the optuna library

    \n\n
    Parameters
    \n\n
      \n
    • trial: The optuna trial object
    • \n
    • hparams: Optional, pass a dictionary of HparamType[Enum] objects
    • \n
    \n\n
    Returns
    \n", "signature": "(\tself,\ttrial: optuna.trial._trial.Trial,\thparams: Sequence[debeir.training.hparm_tuning.types.Hparam] = None):", "funcdef": "def"}, {"fullname": "debeir.training.hparm_tuning.trainer.SentenceTransformerHparamTrainer.build_kwargs_and_model", "modulename": "debeir.training.hparm_tuning.trainer", "qualname": "SentenceTransformerHparamTrainer.build_kwargs_and_model", "kind": "function", "doc": "

    \n", "signature": "(self, hparams: Dict):", "funcdef": "def"}, {"fullname": "debeir.training.hparm_tuning.trainer.SentenceTransformerHparamTrainer.fit", "modulename": "debeir.training.hparm_tuning.trainer", "qualname": "SentenceTransformerHparamTrainer.fit", "kind": "function", "doc": "

    \n", "signature": "(\tself,\tin_trial: optuna.trial._trial.Trial,\ttrain_dataset,\tval_dataset):", "funcdef": "def"}, {"fullname": "debeir.training.hparm_tuning.trainer.trial_callback", "modulename": "debeir.training.hparm_tuning.trainer", "qualname": "trial_callback", "kind": "function", "doc": "

    \n", "signature": "(trial, score, epoch, *args, **kwargs):", "funcdef": "def"}, {"fullname": "debeir.training.hparm_tuning.trainer.SentenceTransformerTrainer", "modulename": "debeir.training.hparm_tuning.trainer", "qualname": "SentenceTransformerTrainer", "kind": "class", "doc": "

    See Optuna documentation for types!

    \n", "bases": "SentenceTransformerHparamTrainer"}, {"fullname": "debeir.training.hparm_tuning.trainer.SentenceTransformerTrainer.__init__", "modulename": "debeir.training.hparm_tuning.trainer", "qualname": "SentenceTransformerTrainer.__init__", "kind": "function", "doc": "

    \n", "signature": "(\tdataset: Union[datasets.dataset_dict.DatasetDict, Dict[str, datasets.arrow_dataset.Dataset]],\thparams_config: debeir.training.hparm_tuning.config.HparamConfig,\tevaluator_fn=None,\tevaluator=None,\tuse_wandb=False)"}, {"fullname": "debeir.training.hparm_tuning.trainer.SentenceTransformerTrainer.fit", "modulename": "debeir.training.hparm_tuning.trainer", "qualname": "SentenceTransformerTrainer.fit", "kind": "function", "doc": "

    \n", "signature": "(self, **extra_kwargs):", "funcdef": "def"}, {"fullname": "debeir.training.hparm_tuning.types", "modulename": "debeir.training.hparm_tuning.types", "kind": "module", "doc": "

    \n"}, {"fullname": "debeir.training.hparm_tuning.types.Hparam", "modulename": "debeir.training.hparm_tuning.types", "qualname": "Hparam", "kind": "class", "doc": "

    \n"}, {"fullname": "debeir.training.hparm_tuning.types.Hparam.__init__", "modulename": "debeir.training.hparm_tuning.types", "qualname": "Hparam.__init__", "kind": "function", "doc": "

    \n", "signature": "()"}, {"fullname": "debeir.training.hparm_tuning.types.Hparam.suggest", "modulename": "debeir.training.hparm_tuning.types", "qualname": "Hparam.suggest", "kind": "function", "doc": "

    \n", "signature": "(self, *args, **kwargs):", "funcdef": "def"}, {"fullname": "debeir.training.hparm_tuning.types.HparamFloat", "modulename": "debeir.training.hparm_tuning.types", "qualname": "HparamFloat", "kind": "class", "doc": "

    \n", "bases": "Hparam"}, {"fullname": "debeir.training.hparm_tuning.types.HparamFloat.__init__", "modulename": "debeir.training.hparm_tuning.types", "qualname": "HparamFloat.__init__", "kind": "function", "doc": "

    \n", "signature": "(\tname: str,\tlow: float,\thigh: float,\tlog: bool = False,\tstep: float = None)"}, {"fullname": "debeir.training.hparm_tuning.types.HparamFloat.suggest", "modulename": "debeir.training.hparm_tuning.types", "qualname": "HparamFloat.suggest", "kind": "function", "doc": "

    \n", "signature": "(self, trial: optuna.trial._trial.Trial):", "funcdef": "def"}, {"fullname": "debeir.training.hparm_tuning.types.HparamInt", "modulename": "debeir.training.hparm_tuning.types", "qualname": "HparamInt", "kind": "class", "doc": "

    \n", "bases": "Hparam"}, {"fullname": "debeir.training.hparm_tuning.types.HparamInt.__init__", "modulename": "debeir.training.hparm_tuning.types", "qualname": "HparamInt.__init__", "kind": "function", "doc": "

    \n", "signature": "(name: str, low: int, high: int, log: bool = False, step: int = 1)"}, {"fullname": "debeir.training.hparm_tuning.types.HparamInt.suggest", "modulename": "debeir.training.hparm_tuning.types", "qualname": "HparamInt.suggest", "kind": "function", "doc": "

    \n", "signature": "(self, trial: optuna.trial._trial.Trial):", "funcdef": "def"}, {"fullname": "debeir.training.hparm_tuning.types.HparamCategorical", "modulename": "debeir.training.hparm_tuning.types", "qualname": "HparamCategorical", "kind": "class", "doc": "

    \n", "bases": "Hparam"}, {"fullname": "debeir.training.hparm_tuning.types.HparamCategorical.__init__", "modulename": "debeir.training.hparm_tuning.types", "qualname": "HparamCategorical.__init__", "kind": "function", "doc": "

    \n", "signature": "(name: str, choices: Sequence, func: str = 'suggest_categorical')"}, {"fullname": "debeir.training.hparm_tuning.types.HparamCategorical.suggest", "modulename": "debeir.training.hparm_tuning.types", "qualname": "HparamCategorical.suggest", "kind": "function", "doc": "

    \n", "signature": "(self, trial: optuna.trial._trial.Trial):", "funcdef": "def"}, {"fullname": "debeir.training.hparm_tuning.types.HparamUniform", "modulename": "debeir.training.hparm_tuning.types", "qualname": "HparamUniform", "kind": "class", "doc": "

    \n", "bases": "Hparam"}, {"fullname": "debeir.training.hparm_tuning.types.HparamUniform.__init__", "modulename": "debeir.training.hparm_tuning.types", "qualname": "HparamUniform.__init__", "kind": "function", "doc": "

    \n", "signature": "(name: str, low: float, high: float, func: str = 'suggest_uniform')"}, {"fullname": "debeir.training.hparm_tuning.types.HparamUniform.suggest", "modulename": "debeir.training.hparm_tuning.types", "qualname": "HparamUniform.suggest", "kind": "function", "doc": "

    \n", "signature": "(self, trial: optuna.trial._trial.Trial):", "funcdef": "def"}, {"fullname": "debeir.training.hparm_tuning.types.HparamLogUniform", "modulename": "debeir.training.hparm_tuning.types", "qualname": "HparamLogUniform", "kind": "class", "doc": "

    \n", "bases": "Hparam"}, {"fullname": "debeir.training.hparm_tuning.types.HparamLogUniform.__init__", "modulename": "debeir.training.hparm_tuning.types", "qualname": "HparamLogUniform.__init__", "kind": "function", "doc": "

    \n", "signature": "(name: str, low: float, high: float, func: str = 'suggest_loguniform')"}, {"fullname": "debeir.training.hparm_tuning.types.HparamLogUniform.suggest", "modulename": "debeir.training.hparm_tuning.types", "qualname": "HparamLogUniform.suggest", "kind": "function", "doc": "

    \n", "signature": "(self, trial: optuna.trial._trial.Trial):", "funcdef": "def"}, {"fullname": "debeir.training.hparm_tuning.types.HparamDiscreteUniform", "modulename": "debeir.training.hparm_tuning.types", "qualname": "HparamDiscreteUniform", "kind": "class", "doc": "

    \n", "bases": "Hparam"}, {"fullname": "debeir.training.hparm_tuning.types.HparamDiscreteUniform.__init__", "modulename": "debeir.training.hparm_tuning.types", "qualname": "HparamDiscreteUniform.__init__", "kind": "function", "doc": "

    \n", "signature": "(\tname: str,\tlow: float,\thigh: float,\tq: float,\tfunc: str = 'suggest_discrete_uniform')"}, {"fullname": "debeir.training.hparm_tuning.types.HparamDiscreteUniform.suggest", "modulename": "debeir.training.hparm_tuning.types", "qualname": "HparamDiscreteUniform.suggest", "kind": "function", "doc": "

    \n", "signature": "(self, trial: optuna.trial._trial.Trial):", "funcdef": "def"}, {"fullname": "debeir.training.losses", "modulename": "debeir.training.losses", "kind": "module", "doc": "

    \n"}, {"fullname": "debeir.training.losses.contrastive", "modulename": "debeir.training.losses.contrastive", "kind": "module", "doc": "

    Author: Yonglong Tian (yonglong@mit.edu)\nDate: May 07, 2020

    \n\n

    Code imported from: https://github.com/HobbitLong/SupContrast/blob/master/losses.py

    \n"}, {"fullname": "debeir.training.losses.contrastive.SupConLoss", "modulename": "debeir.training.losses.contrastive", "qualname": "SupConLoss", "kind": "class", "doc": "

    Supervised Contrastive Learning: https://arxiv.org/pdf/2004.11362.pdf.\nIt also supports the unsupervised contrastive loss in SimCLR

    \n", "bases": "torch.nn.modules.module.Module"}, {"fullname": "debeir.training.losses.contrastive.SupConLoss.__init__", "modulename": "debeir.training.losses.contrastive", "qualname": "SupConLoss.__init__", "kind": "function", "doc": "

    Initializes internal Module state, shared by both nn.Module and ScriptModule.

    \n", "signature": "(temperature=1.0, contrast_mode='all', base_temperature=1.0)"}, {"fullname": "debeir.training.losses.contrastive.SupConLoss.forward", "modulename": "debeir.training.losses.contrastive", "qualname": "SupConLoss.forward", "kind": "function", "doc": "

    Compute loss for model. If both labels and mask are None,\nit degenerates to SimCLR unsupervised loss:\nhttps://arxiv.org/pdf/2002.05709.pdf\nArgs:\n features: hidden vector of shape [bsz, n_views, ...].\n labels: ground truth of shape [bsz].\n mask: contrastive mask of shape [bsz, bsz], mask_{i,j}=1 if sample j\n has the same class as sample i. Can be asymmetric.\nReturns:\n A loss scalar.

    \n", "signature": "(self, features, labels=None, mask=None):", "funcdef": "def"}, {"fullname": "debeir.training.losses.contrastive.SiameseDistanceMetric", "modulename": "debeir.training.losses.contrastive", "qualname": "SiameseDistanceMetric", "kind": "class", "doc": "

    The metric for the contrastive loss

    \n", "bases": "enum.Enum"}, {"fullname": "debeir.training.losses.contrastive.SiameseDistanceMetric.EUCLIDEAN", "modulename": "debeir.training.losses.contrastive", "qualname": "SiameseDistanceMetric.EUCLIDEAN", "kind": "function", "doc": "

    \n", "signature": "(x, y):", "funcdef": "def"}, {"fullname": "debeir.training.losses.contrastive.SiameseDistanceMetric.MANHATTAN", "modulename": "debeir.training.losses.contrastive", "qualname": "SiameseDistanceMetric.MANHATTAN", "kind": "function", "doc": "

    \n", "signature": "(x, y):", "funcdef": "def"}, {"fullname": "debeir.training.losses.contrastive.SiameseDistanceMetric.COSINE_DISTANCE", "modulename": "debeir.training.losses.contrastive", "qualname": "SiameseDistanceMetric.COSINE_DISTANCE", "kind": "function", "doc": "

    \n", "signature": "(x, y):", "funcdef": "def"}, {"fullname": "debeir.training.losses.contrastive.ContrastiveSentLoss", "modulename": "debeir.training.losses.contrastive", "qualname": "ContrastiveSentLoss", "kind": "class", "doc": "

    Contrastive loss. Expects as input two texts and a label of either 0 or 1. If the label == 1, then the distance between the\ntwo embeddings is reduced. If the label == 0, then the distance between the embeddings is increased.\nFurther information: http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf

    \n\n
    Parameters
    \n\n
      \n
    • model: SentenceTransformer model
    • \n
    • distance_metric: Function that returns a distance between two embeddings. The class SiameseDistanceMetric contains pre-defined metrics that can be used
    • \n
    • margin: Negative samples (label == 0) should have a distance of at least the margin value.
    • \n
    • size_average: Average by the size of the mini-batch.

      Example::

          from sentence_transformers import SentenceTransformer, LoggingHandler, losses, InputExample
          from torch.utils.data import DataLoader

          model = SentenceTransformer('all-MiniLM-L6-v2')
          train_examples = [
              InputExample(texts=['This is a positive pair', 'Where the distance will be minimized'], label=1),
              InputExample(texts=['This is a negative pair', 'Their distance will be increased'], label=0)]

          train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=2)
          train_loss = losses.ContrastiveLoss(model=model)
          model.fit([(train_dataloader, train_loss)], show_progress_bar=True)
    • \n
    \n", "bases": "torch.nn.modules.module.Module"}, {"fullname": "debeir.training.losses.contrastive.ContrastiveSentLoss.__init__", "modulename": "debeir.training.losses.contrastive", "qualname": "ContrastiveSentLoss.__init__", "kind": "function", "doc": "

    Initializes internal Module state, shared by both nn.Module and ScriptModule.

    \n", "signature": "(\tmodel,\tdistance_metric=<function SiameseDistanceMetric.<lambda>>,\tmargin: float = 0.5,\tsize_average: bool = True)"}, {"fullname": "debeir.training.losses.contrastive.ContrastiveSentLoss.get_config_dict", "modulename": "debeir.training.losses.contrastive", "qualname": "ContrastiveSentLoss.get_config_dict", "kind": "function", "doc": "

    \n", "signature": "(self):", "funcdef": "def"}, {"fullname": "debeir.training.losses.contrastive.ContrastiveSentLoss.forward", "modulename": "debeir.training.losses.contrastive", "qualname": "ContrastiveSentLoss.forward", "kind": "function", "doc": "

    Defines the computation performed at every call.

    \n\n

    Should be overridden by all subclasses.

    \n\n
    \n\n

    Although the recipe for forward pass needs to be defined within\nthis function, one should call the Module instance afterwards\ninstead of this since the former takes care of running the\nregistered hooks while the latter silently ignores them.

    \n\n
    \n", "signature": "(\tself,\tsentence_features: Iterable[Dict[str, torch.Tensor]],\tlabels: torch.Tensor):", "funcdef": "def"}, {"fullname": "debeir.training.losses.ranking", "modulename": "debeir.training.losses.ranking", "kind": "module", "doc": "

    Losses are drawn from the allrank library

    \n"}, {"fullname": "debeir.training.train_reranker", "modulename": "debeir.training.train_reranker", "kind": "module", "doc": "

    \n"}, {"fullname": "debeir.training.train_reranker.train_cross_encoder_reranker", "modulename": "debeir.training.train_reranker", "qualname": "train_cross_encoder_reranker", "kind": "function", "doc": "

    Trains a reranker with relevance signals

    \n\n
    Parameters
    \n\n
      \n
    • model_fp_or_name: The model name or path to the model
    • \n
    • output_dir: Output directory to save model, logs etc.
    • \n
    • train_dataset: Training Examples
    • \n
    • dev_dataset: Dev examples
    • \n
    • train_batch_size: Training batch size
    • \n
    • num_epochs: Number of epochs
    • \n
    • warmup_steps: Warmup steps for the scheduler
    • \n
    • evaluate_every_n_step: Evaluate the model every n steps
    • \n
    • special_tokens: Special tokens to add, defaults to [DOC], [QRY] tokens (bi-encoder)
    • \n
    • pooling_mode: Pooling mode for a sentence transformer model
    • \n
    • loss_func: Loss function(s) to use
    • \n
    • evaluator: Evaluator to use
    • \n
    \n", "signature": "(\tmodel_fp_or_name: str,\toutput_dir: str,\ttrain_dataset: List[debeir.datasets.types.RelevanceExample],\tdev_dataset: List[debeir.datasets.types.RelevanceExample],\ttrain_batch_size=32,\tnum_epochs=3,\twarmup_steps=None,\tevaluate_every_n_step: int = 1000,\tspecial_tokens=None,\tpooling_mode=None,\tloss_func=None,\tevaluator: sentence_transformers.evaluation.SentenceEvaluator.SentenceEvaluator = None,\t*args,\t**kwargs):", "funcdef": "def"}, {"fullname": "debeir.training.train_sentence_encoder", "modulename": "debeir.training.train_sentence_encoder", "kind": "module", "doc": "

    \n"}, {"fullname": "debeir.training.train_sentence_encoder.train_biencoder", "modulename": "debeir.training.train_sentence_encoder", "qualname": "train_biencoder", "kind": "function", "doc": "

    Train a universal sentence encoder

    \n\n
    Parameters
    \n\n
      \n
    • model_fp_or_name: The model name or path to the model
    • \n
    • output_dir: Output directory to save model, logs etc.
    • \n
    • train_examples: Training Examples
    • \n
    • dev_examples: Dev examples
    • \n
    • train_batch_size: Training batch size
    • \n
    • num_epochs: Number of epochs
    • \n
    • warmup_steps: Warmup steps for the scheduler
    • \n
    • evaluate_every_n_step: Evaluate the model every n steps
    • \n
    • special_tokens: Special tokens to add
    • \n
    • pooling_mode: Pooling mode for a sentence transformer model
    • \n
    • loss_func: Loss function(s) to use
    • \n
    • evaluator: Evaluator to use
    • \n
    \n", "signature": "(\tmodel_fp_or_name: str,\toutput_dir: str,\ttrain_examples: List[debeir.datasets.types.InputExample],\tdev_examples: List[debeir.datasets.types.InputExample],\ttrain_batch_size=32,\tnum_epochs=3,\twarmup_steps=None,\tevaluate_every_n_step: int = 1000,\tspecial_tokens=None,\tpooling_mode=None,\tloss_func=None,\tevaluator: sentence_transformers.evaluation.SentenceEvaluator.SentenceEvaluator = None,\t*args,\t**kwargs):", "funcdef": "def"}, {"fullname": "debeir.training.train_sentence_encoder.train_huggingface_transformer", "modulename": "debeir.training.train_sentence_encoder", "qualname": "train_huggingface_transformer", "kind": "function", "doc": "

    Train a transformer model using the Huggingface API

    \n\n
    Parameters
    \n\n
      \n
    • model_fp_or_name_or_cls: Model name or model class to instantiate
    • \n
    • tokenizer: Tokenizer
    • \n
    • output_dir: Output directory to write to
    • \n
    • compute_metric_fn: Metric function to compute metrics
    • \n
    • metric: Metric used by the compute_metric_fn
    • \n
    • dataset: Huggingface Dataset Dict
    • \n
    • train_dataset: Training dataset to be used by the Trainer class
    • \n
    • eval_dataset: Evaluation dataset to be used by the Trainer class
    • \n
    • train_batch_size: Batch size to use for training
    • \n
    • num_epochs: Number of training epochs (default: 3)
    • \n
    • learning_rate: Learning rate (default: 5e-5)
    • \n
    • lr_scheduler_type: Learning rate type, see SchedulerType
    • \n
    • optimizer: Optimizer
    • \n
    • warmup_ratio: Warmup steps as a ratio of total steps (default 0.1)
    • \n
    • evaluate_every_n_step: Number of steps to evaluate
    • \n
    • pooling_mode: Pooling mode for your model
    • \n
    • loss_func: Loss function to instantiate model
    • \n
    • model_args: Model arguments to pass
    • \n
    • model_kwargs: Model keyword arguments
    • \n
    • padding_strategy: Tokenization padding strategy
    • \n
    • truncate: Truncation strategy for tokenization
    • \n
    • special_tokens: Special tokens to add to the tokenizer
    • \n
    • seed: Dataset shuffle seed
    • \n
    • args:
    • \n
    • kwargs:
    • \n
    \n\n
    Returns
    \n", "signature": "(\tmodel_fp_or_name_or_cls: Union[str, transformers.modeling_utils.PreTrainedModel],\ttokenizer: transformers.tokenization_utils.PreTrainedTokenizer,\toutput_dir: str,\tcompute_metric_fn,\tmetric: datasets.metric.Metric,\tdataset: datasets.dataset_dict.DatasetDict = None,\ttrain_dataset: List[Union[debeir.datasets.types.RelevanceExample, debeir.datasets.types.InputExample, datasets.arrow_dataset.Dataset]] = None,\teval_dataset: List[Union[debeir.datasets.types.RelevanceExample, debeir.datasets.types.InputExample, datasets.arrow_dataset.Dataset]] = None,\ttrain_batch_size=32,\tnum_epochs=3,\tlearning_rate=5e-05,\tlr_scheduler_type: transformers.trainer_utils.SchedulerType = <SchedulerType.CONSTANT_WITH_WARMUP: 'constant_with_warmup'>,\toptimizer: str = 'adamw_hf',\twarmup_ratio=0.1,\tevaluate_every_n_step: int = 1000,\tpooling_mode=None,\tloss_func=None,\tmodel_args=None,\tmodel_kwargs=None,\tpadding_strategy='max_length',\ttruncate=True,\tspecial_tokens=None,\tseed=42,\t*args,\t**kwargs) -> transformers.trainer.Trainer:", "funcdef": "def"}, {"fullname": "debeir.training.utils", "modulename": "debeir.training.utils", "kind": "module", "doc": "

    \n"}, {"fullname": "debeir.training.utils.LoggingScheduler", "modulename": "debeir.training.utils", "qualname": "LoggingScheduler", "kind": "class", "doc": "

    \n"}, {"fullname": "debeir.training.utils.LoggingScheduler.__init__", "modulename": "debeir.training.utils", "qualname": "LoggingScheduler.__init__", "kind": "function", "doc": "

    \n", "signature": "(scheduler: torch.optim.lr_scheduler.LambdaLR)"}, {"fullname": "debeir.training.utils.LoggingScheduler.step", "modulename": "debeir.training.utils", "qualname": "LoggingScheduler.step", "kind": "function", "doc": "

    \n", "signature": "(self, epoch=None):", "funcdef": "def"}, {"fullname": "debeir.training.utils.get_scheduler_with_wandb", "modulename": "debeir.training.utils", "qualname": "get_scheduler_with_wandb", "kind": "function", "doc": "

    Returns the correct learning rate scheduler. Available schedulers: constantlr, warmupconstant, warmuplinear, warmupcosine, warmupcosinewithhardrestarts

    \n", "signature": "(optimizer, scheduler: str, warmup_steps: int, t_total: int):", "funcdef": "def"}, {"fullname": "debeir.training.utils.LoggingLoss", "modulename": "debeir.training.utils", "qualname": "LoggingLoss", "kind": "class", "doc": "

    \n"}, {"fullname": "debeir.training.utils.LoggingLoss.__init__", "modulename": "debeir.training.utils", "qualname": "LoggingLoss.__init__", "kind": "function", "doc": "

    \n", "signature": "(loss_fn)"}, {"fullname": "debeir.training.utils.TokenizerOverload", "modulename": "debeir.training.utils", "qualname": "TokenizerOverload", "kind": "class", "doc": "

    \n"}, {"fullname": "debeir.training.utils.TokenizerOverload.__init__", "modulename": "debeir.training.utils", "qualname": "TokenizerOverload.__init__", "kind": "function", "doc": "

    \n", "signature": "(tokenizer, tokenizer_kwargs, debug=False)"}, {"fullname": "debeir.training.utils.LoggingEvaluator", "modulename": "debeir.training.utils", "qualname": "LoggingEvaluator", "kind": "class", "doc": "

    \n"}, {"fullname": "debeir.training.utils.LoggingEvaluator.__init__", "modulename": "debeir.training.utils", "qualname": "LoggingEvaluator.__init__", "kind": "function", "doc": "

    \n", "signature": "(evaluator)"}, {"fullname": "debeir.training.utils.SentDataset", "modulename": "debeir.training.utils", "qualname": "SentDataset", "kind": "class", "doc": "

    \n"}, {"fullname": "debeir.training.utils.SentDataset.__init__", "modulename": "debeir.training.utils", "qualname": "SentDataset.__init__", "kind": "function", "doc": "

    \n", "signature": "(\tdataset: datasets.arrow_dataset.Dataset,\ttext_cols: List[str],\tlabel_col: str = None,\tlabel=None)"}, {"fullname": "debeir.training.utils.SentDatasetList", "modulename": "debeir.training.utils", "qualname": "SentDatasetList", "kind": "class", "doc": "

    \n"}, {"fullname": "debeir.training.utils.SentDatasetList.__init__", "modulename": "debeir.training.utils", "qualname": "SentDatasetList.__init__", "kind": "function", "doc": "

    \n", "signature": "(datasets: List[debeir.training.utils.SentDataset])"}, {"fullname": "debeir.training.utils.tokenize_function", "modulename": "debeir.training.utils", "qualname": "tokenize_function", "kind": "function", "doc": "

    Tokenizer function

    \n\n
    Parameters
    \n\n
      \n
    • tokenizer: Tokenizer
    • \n
    • examples: Input examples to tokenize
    • \n
    • padding_strategy: Padding strategy
    • \n
    • truncate: Truncate sentences
    • \n
    \n\n
    Returns
    \n\n
    \n
    Returns a list of tokenized examples\n
    \n
    \n", "signature": "(tokenizer, examples, padding_strategy, truncate):", "funcdef": "def"}, {"fullname": "debeir.training.utils.get_max_seq_length", "modulename": "debeir.training.utils", "qualname": "get_max_seq_length", "kind": "function", "doc": "

    \n", "signature": "(tokenizer, dataset, x_labels, dataset_key='train'):", "funcdef": "def"}, {"fullname": "debeir.utils", "modulename": "debeir.utils", "kind": "module", "doc": "

    Common utilities such as score normalization and creating an output directory with checks

    \n"}, {"fullname": "debeir.utils.scaler", "modulename": "debeir.utils.scaler", "kind": "module", "doc": "

    \n"}, {"fullname": "debeir.utils.scaler.unpack_elasticsearch_scores", "modulename": "debeir.utils.scaler", "qualname": "unpack_elasticsearch_scores", "kind": "function", "doc": "

    Helper function to retrieve the top score of documents for each topic.\nUsed in NIR weight adjustment calculation.

    \n\n
    Parameters
    \n\n
      \n
    • results: Raw input of results from Elasticsearch library
    • \n
    \n\n
    Returns
    \n\n
    \n
    Returns a 1-D dictionary of {topic_num: top_score} pairs.\n
    \n
    \n", "signature": "(results) -> Dict:", "funcdef": "def"}, {"fullname": "debeir.utils.scaler.get_z_value", "modulename": "debeir.utils.scaler", "qualname": "get_z_value", "kind": "function", "doc": "

    Analytical solution for the normalization constant, z, used in NIR log normalization.

    \n\n
    Parameters
    \n\n
      \n
    • cosine_ceiling: The highest theoretical additive cosine score
    • \n
    • bm25_ceiling: The highest BM25 score retrieved from a given topic OR an estimate.
    • \n
    \n\n
    Returns
    \n\n
    \n
    The normalization parameter for NIR log normalization.\n
    \n
    \n", "signature": "(cosine_ceiling, bm25_ceiling) -> float:", "funcdef": "def"}, {"fullname": "debeir.utils.utils", "modulename": "debeir.utils.utils", "kind": "module", "doc": "

    \n"}, {"fullname": "debeir.utils.utils.create_output_file", "modulename": "debeir.utils.utils", "qualname": "create_output_file", "kind": "function", "doc": "

    Create output file based on config instructions

    \n\n
    Parameters
    \n\n
      \n
    • config: The config object with output file options.
    • \n
    • config_fp: The config file path used in default naming options for the output file.
    • \n
    • remove: Overwrites the output file if it exists
    • \n
    • output_file: The output file path if it exists
    • \n
    • output_directory: The output directory used for default naming (specified in nir config)
    • \n
    • kwargs: Compatibility arguments
    • \n
    \n\n
    Returns
    \n", "signature": "(config, config_fp, remove, output_file, output_directory, **kwargs):", "funcdef": "def"}, {"fullname": "debeir.utils.utils.unpack_coroutine", "modulename": "debeir.utils.utils", "qualname": "unpack_coroutine", "kind": "function", "doc": "

    Recursively unwraps co-routines until a result is reached.

    \n\n
    Parameters
    \n\n
      \n
    • f: Wrapped co-routine function.
    • \n
    \n\n
    Returns
    \n\n
    \n
    Results from the (final) evaluated co-routine.\n
    \n
    \n", "signature": "(f):", "funcdef": "async def"}, {"fullname": "debeir.utils.utils.flatten", "modulename": "debeir.utils.utils", "qualname": "flatten", "kind": "function", "doc": "

    Flattens a multidimensional dictionary (dictionary of dictionaries) into a single layer, with child keys separated by\n"sep"

    \n\n
    Parameters
    \n\n
      \n
    • d: Multi-level dictionary to flatten.
    • \n
    • parent_key: Prepend a parent_key to all layers.
    • \n
    • sep: Separator token between child and parent layers.
    • \n
    \n\n
    Returns
    \n\n
    \n
    A flattened 1-D dictionary with keys separated by *sep*.\n
    \n
    \n", "signature": "(d, parent_key='', sep='_'):", "funcdef": "def"}, {"fullname": "debeir.utils.utils.remove_excess_whitespace", "modulename": "debeir.utils.utils", "qualname": "remove_excess_whitespace", "kind": "function", "doc": "

    \n", "signature": "(s):", "funcdef": "def"}]; + /** pdoc search index */const docs = [{"fullname": "debeir", "modulename": "debeir", "kind": "module", "doc": "

    The DeBEIR (Dense Bi-Encoder Information Retrieval) source code library.

    \n\n

    See ./examples/ in the parent directory for out-of-the-box runnable code.

    \n\n

    Otherwise, check out notebooks in the parent directory for training your own model amongst other things.

    \n"}, {"fullname": "debeir.core", "modulename": "debeir.core", "kind": "module", "doc": "

    Core library interfaces that must be implemented for custom datasets

    \n\n

    Interfaces to implement custom datasets in debeir.datasets.

    \n"}, {"fullname": "debeir.core.callbacks", "modulename": "debeir.core.callbacks", "kind": "module", "doc": "

    Callbacks for before/after running.\nE.g. before is for setup,\nafter is for evaluation/serialization etc.

    \n"}, {"fullname": "debeir.core.callbacks.Callback", "modulename": "debeir.core.callbacks", "qualname": "Callback", "kind": "class", "doc": "

    \n"}, {"fullname": "debeir.core.callbacks.Callback.__init__", "modulename": "debeir.core.callbacks", "qualname": "Callback.__init__", "kind": "function", "doc": "

    \n", "signature": "()"}, {"fullname": "debeir.core.callbacks.Callback.before", "modulename": "debeir.core.callbacks", "qualname": "Callback.before", "kind": "function", "doc": "

    \n", "signature": "(self, pipeline: debeir.core.pipeline.Pipeline):", "funcdef": "def"}, {"fullname": "debeir.core.callbacks.Callback.after", "modulename": "debeir.core.callbacks", "qualname": "Callback.after", "kind": "function", "doc": "

    \n", "signature": "(self, results: List):", "funcdef": "def"}, {"fullname": "debeir.core.callbacks.SerializationCallback", "modulename": "debeir.core.callbacks", "qualname": "SerializationCallback", "kind": "class", "doc": "

    \n", "bases": "Callback"}, {"fullname": "debeir.core.callbacks.SerializationCallback.__init__", "modulename": "debeir.core.callbacks", "qualname": "SerializationCallback.__init__", "kind": "function", "doc": "

    \n", "signature": "(\tconfig: debeir.core.config.GenericConfig,\tnir_config: debeir.core.config.NIRConfig)"}, {"fullname": "debeir.core.callbacks.SerializationCallback.before", "modulename": "debeir.core.callbacks", "qualname": "SerializationCallback.before", "kind": "function", "doc": "

    Check if output file exists

    \n\n
    Returns
    \n\n
    \n
    Output file path\n
    \n
    \n", "signature": "(self, pipeline: debeir.core.pipeline.Pipeline):", "funcdef": "def"}, {"fullname": "debeir.core.callbacks.SerializationCallback.after", "modulename": "debeir.core.callbacks", "qualname": "SerializationCallback.after", "kind": "function", "doc": "

    Serialize results to self.output_file in a TREC-style format

    \n\n
    Parameters
    \n\n
      \n
    • results: Results from the pipeline run to serialize (the run name for TREC-style runs defaults to NO_RUN_NAME)
    • \n
    \n", "signature": "(self, results: List):", "funcdef": "def"}, {"fullname": "debeir.core.callbacks.EvaluationCallback", "modulename": "debeir.core.callbacks", "qualname": "EvaluationCallback", "kind": "class", "doc": "

    \n", "bases": "Callback"}, {"fullname": "debeir.core.callbacks.EvaluationCallback.__init__", "modulename": "debeir.core.callbacks", "qualname": "EvaluationCallback.__init__", "kind": "function", "doc": "

    \n", "signature": "(evaluator: debeir.evaluation.evaluator.Evaluator, config)"}, {"fullname": "debeir.core.callbacks.EvaluationCallback.before", "modulename": "debeir.core.callbacks", "qualname": "EvaluationCallback.before", "kind": "function", "doc": "

    \n", "signature": "(self, pipeline: debeir.core.pipeline.Pipeline):", "funcdef": "def"}, {"fullname": "debeir.core.callbacks.EvaluationCallback.after", "modulename": "debeir.core.callbacks", "qualname": "EvaluationCallback.after", "kind": "function", "doc": "

    \n", "signature": "(self, results: List, id_field='id'):", "funcdef": "def"}, {"fullname": "debeir.core.config", "modulename": "debeir.core.config", "kind": "module", "doc": "

    \n"}, {"fullname": "debeir.core.config.Config", "modulename": "debeir.core.config", "qualname": "Config", "kind": "class", "doc": "

    Config Interface with creation class methods

    \n"}, {"fullname": "debeir.core.config.Config.__init__", "modulename": "debeir.core.config", "qualname": "Config.__init__", "kind": "function", "doc": "

    \n", "signature": "()"}, {"fullname": "debeir.core.config.Config.from_toml", "modulename": "debeir.core.config", "qualname": "Config.from_toml", "kind": "function", "doc": "

    Instantiates a Config object from a toml file

    \n\n
    Parameters
    \n\n
      \n
    • fp: File path of the Config TOML file
    • \n
    • field_class: Class of the Config object to be instantiated
    • \n
    • args: Arguments to be passed to Config
    • \n
    • kwargs: Keyword arguments to be passed
    • \n
    \n\n
    Returns
    \n\n
    \n
    An instantiated and validated Config object.\n
    \n
    \n", "signature": "(\tcls,\tfp: Union[str, pathlib.Path],\tfield_class,\t*args,\t**kwargs) -> debeir.core.config.Config:", "funcdef": "def"}, {"fullname": "debeir.core.config.Config.from_args", "modulename": "debeir.core.config", "qualname": "Config.from_args", "kind": "function", "doc": "

    Instantiates a Config object from arguments

    \n\n
    Parameters
    \n\n
      \n
    • args_dict:
    • \n
    • field_class:
    • \n
    • args:
    • \n
    • kwargs:
    • \n
    \n\n
    Returns
    \n", "signature": "(cls, args_dict: MutableMapping, field_class, *args, **kwargs):", "funcdef": "def"}, {"fullname": "debeir.core.config.Config.from_dict", "modulename": "debeir.core.config", "qualname": "Config.from_dict", "kind": "function", "doc": "

    Instantiates a Config object from a dictionary

    \n\n
    Parameters
    \n\n
      \n
    • data_class:
    • \n
    • kwargs:
    • \n
    \n\n
    Returns
    \n", "signature": "(cls, data_class, **kwargs):", "funcdef": "def"}, {"fullname": "debeir.core.config.Config.validate", "modulename": "debeir.core.config", "qualname": "Config.validate", "kind": "function", "doc": "

    Validates if the config is correct.\nMust be implemented by inherited classes.

    \n", "signature": "(self):", "funcdef": "def"}, {"fullname": "debeir.core.config.GenericConfig", "modulename": "debeir.core.config", "qualname": "GenericConfig", "kind": "class", "doc": "

    Generic NIR Configuration file from which all configs will inherit

    \n", "bases": "Config, abc.ABC"}, {"fullname": "debeir.core.config.GenericConfig.__init__", "modulename": "debeir.core.config", "qualname": "GenericConfig.__init__", "kind": "function", "doc": "

    \n", "signature": "(\tquery_type: str,\tindex: str = None,\tencoder_normalize: bool = True,\tablations: bool = False,\tnorm_weight: float = None,\tautomatic: bool = None,\tencoder: object = None,\tencoder_fp: str = None,\tquery_weights: List[float] = None,\tcosine_weights: List[float] = None,\tevaluate: bool = False,\tqrels: str = None,\tconfig_fn: str = None,\tquery_fn: str = None,\tparser_fn: str = None,\texecutor_fn: str = None,\tcosine_ceiling: float = None,\ttopics_path: str = None,\treturn_id_only: bool = False,\toverwrite_output_if_exists: bool = False,\toutput_file: str = None,\trun_name: str = None)"}, {"fullname": "debeir.core.config.GenericConfig.from_toml", "modulename": "debeir.core.config", "qualname": "GenericConfig.from_toml", "kind": "function", "doc": "

    Instantiates a Config object from a toml file

    \n\n
    Parameters
    \n\n
      \n
    • fp: File path of the Config TOML file
    • \n
    • field_class: Class of the Config object to be instantiated
    • \n
    • args: Arguments to be passed to Config
    • \n
    • kwargs: Keyword arguments to be passed
    • \n
    \n\n
    Returns
    \n\n
    \n
    An instantiated and validated Config object.\n
    \n
    \n", "signature": "(\tcls,\tfp: Union[str, pathlib.Path],\t*args,\t**kwargs) -> debeir.core.config.GenericConfig:", "funcdef": "def"}, {"fullname": "debeir.core.config.ElasticsearchConfig", "modulename": "debeir.core.config", "qualname": "ElasticsearchConfig", "kind": "class", "doc": "

    Basic Elasticsearch configuration file settings from the master nir.toml file

    \n", "bases": "Config"}, {"fullname": "debeir.core.config.ElasticsearchConfig.__init__", "modulename": "debeir.core.config", "qualname": "ElasticsearchConfig.__init__", "kind": "function", "doc": "

    \n", "signature": "(protocol: str, ip: str, port: str, timeout: int)"}, {"fullname": "debeir.core.config.ElasticsearchConfig.validate", "modulename": "debeir.core.config", "qualname": "ElasticsearchConfig.validate", "kind": "function", "doc": "

    Checks if Elasticsearch URL is correct

    \n", "signature": "(self):", "funcdef": "def"}, {"fullname": "debeir.core.config.SolrConfig", "modulename": "debeir.core.config", "qualname": "SolrConfig", "kind": "class", "doc": "

    Basic Solr configuration file settings from the master nir.toml file

    \n", "bases": "ElasticsearchConfig"}, {"fullname": "debeir.core.config.SolrConfig.__init__", "modulename": "debeir.core.config", "qualname": "SolrConfig.__init__", "kind": "function", "doc": "

    \n", "signature": "(protocol: str, ip: str, port: str, timeout: int)"}, {"fullname": "debeir.core.config.MetricsConfig", "modulename": "debeir.core.config", "qualname": "MetricsConfig", "kind": "class", "doc": "

    Basic Metrics configuration file settings from the master nir.toml file

    \n", "bases": "Config"}, {"fullname": "debeir.core.config.MetricsConfig.__init__", "modulename": "debeir.core.config", "qualname": "MetricsConfig.__init__", "kind": "function", "doc": "

    \n", "signature": "(metrics: List[str])"}, {"fullname": "debeir.core.config.MetricsConfig.validate", "modulename": "debeir.core.config", "qualname": "MetricsConfig.validate", "kind": "function", "doc": "

    Checks if each metric is usable by evaluator classes

    \n", "signature": "(self):", "funcdef": "def"}, {"fullname": "debeir.core.config.NIRConfig", "modulename": "debeir.core.config", "qualname": "NIRConfig", "kind": "class", "doc": "

    Basic NIR configuration file settings from the master nir.toml file

    \n", "bases": "Config"}, {"fullname": "debeir.core.config.NIRConfig.__init__", "modulename": "debeir.core.config", "qualname": "NIRConfig.__init__", "kind": "function", "doc": "

    \n", "signature": "(\tnorm_weight: str,\tevaluate: bool,\treturn_size: int,\toutput_directory: str)"}, {"fullname": "debeir.core.config.NIRConfig.validate", "modulename": "debeir.core.config", "qualname": "NIRConfig.validate", "kind": "function", "doc": "

    Validates if the config is correct.\nMust be implemented by inherited classes.

    \n", "signature": "(self):", "funcdef": "def"}, {"fullname": "debeir.core.config.apply_config", "modulename": "debeir.core.config", "qualname": "apply_config", "kind": "function", "doc": "

    Configuration decorator.

    \n\n
    Parameters
    \n\n
      \n
    • func: Decorated function
    • \n
    \n\n
    Returns
    \n", "signature": "(func):", "funcdef": "def"}, {"fullname": "debeir.core.config.override_with_toml_config", "modulename": "debeir.core.config", "qualname": "override_with_toml_config", "kind": "function", "doc": "

    Configuration decorator. Overwrites a function's kwargs and args with a specified toml config file.\nPass override_with_config=path/to/config

    \n\n
    Parameters
    \n\n
      \n
    • func: Decorated function
    • \n
    \n\n
    Returns
    \n", "signature": "(func):", "funcdef": "def"}, {"fullname": "debeir.core.config.save_kwargs_to_file", "modulename": "debeir.core.config", "qualname": "save_kwargs_to_file", "kind": "function", "doc": "

    \n", "signature": "(func):", "funcdef": "def"}, {"fullname": "debeir.core.converters", "modulename": "debeir.core.converters", "kind": "module", "doc": "

    \n"}, {"fullname": "debeir.core.converters.ParsedTopicsToDataset", "modulename": "debeir.core.converters", "qualname": "ParsedTopicsToDataset", "kind": "class", "doc": "

    Converts a parser's output to a huggingface dataset object.

    \n"}, {"fullname": "debeir.core.converters.ParsedTopicsToDataset.__init__", "modulename": "debeir.core.converters", "qualname": "ParsedTopicsToDataset.__init__", "kind": "function", "doc": "

    \n", "signature": "()"}, {"fullname": "debeir.core.converters.ParsedTopicsToDataset.convert", "modulename": "debeir.core.converters", "qualname": "ParsedTopicsToDataset.convert", "kind": "function", "doc": "

    Flatten a Dict of shape (traditional parser output)\n{topic_id: {\n \"Facet_1\": ...\n \"Facet_2\": ...\n }\n}

    \n\n

    ->

    \n\n

    To a flattened arrow-like dataset.\n{\ntopic_ids: [],\nFacet_1s: [],\nFacet_2s: [],\n}

    \n\n
    Parameters
    \n\n
      \n
    • output: Topics output from the parser object
    • \n
    \n\n
    Returns
    \n", "signature": "(\tcls,\tparser: debeir.core.parser.Parser,\toutput: Dict[Union[str, int], Dict]):", "funcdef": "def"}, {"fullname": "debeir.core.document", "modulename": "debeir.core.document", "kind": "module", "doc": "

    \n"}, {"fullname": "debeir.core.document.Document", "modulename": "debeir.core.document", "qualname": "Document", "kind": "class", "doc": "

    Generic Document class.\nUsed as an interface for interacting across multiple indexes with different mappings.

    \n"}, {"fullname": "debeir.core.document.Document.__init__", "modulename": "debeir.core.document", "qualname": "Document.__init__", "kind": "function", "doc": "

    \n", "signature": "(\tdoc_id: Union[int, float, str],\ttopic_num: Union[int, str, float] = None,\tfacets: Dict = None,\tscore: Union[float, int] = 0.0,\tscores: Dict[str, Union[float, int]] = <factory>)"}, {"fullname": "debeir.core.document.Document.from_results", "modulename": "debeir.core.document", "qualname": "Document.from_results", "kind": "function", "doc": "

    Produces a list of Document objects from raw results returned from the index

    \n\n

    In the format {topic_num: [Document, ..., Document]}

    \n", "signature": "(\tcls,\tresults,\t*args,\t**kwargs) -> Dict[Union[int, float], debeir.core.document.Document]:", "funcdef": "def"}, {"fullname": "debeir.core.document.Document.get_document_id", "modulename": "debeir.core.document", "qualname": "Document.get_document_id", "kind": "function", "doc": "
    Returns
    \n\n
    \n
    self.doc_id\n
    \n
    \n", "signature": "(self):", "funcdef": "def"}, {"fullname": "debeir.core.document.Document.flatten_facets", "modulename": "debeir.core.document", "qualname": "Document.flatten_facets", "kind": "function", "doc": "

    Flattens multi-level internal document facets into a single level\n e.g. Doc['Upper']['Lower'] -> Doc['Upper_Lower']

    \n\n
    Parameters
    \n\n
      \n
    • args:
    • \n
    • kwargs:
    • \n
    \n", "signature": "(self, *args, **kwargs):", "funcdef": "def"}, {"fullname": "debeir.core.document.Document.get_document_facet", "modulename": "debeir.core.document", "qualname": "Document.get_document_facet", "kind": "function", "doc": "

    Retrieve a document facet\nWorks for multidimensional keys or single

    \n\n
    Parameters
    \n\n
      \n
    • key: Facet to retrieve
    • \n
    • sep: The seperator for multidimensional key
    • \n
    \n\n
    Returns
    \n\n
    \n
    Returns the document facet given the key (field)\n
    \n
    \n", "signature": "(self, key, sep='_'):", "funcdef": "def"}, {"fullname": "debeir.core.document.Document.set", "modulename": "debeir.core.document", "qualname": "Document.set", "kind": "function", "doc": "

    Set attributes of the object. Use keyword arguments to do so. Works as a builder class.\ndoc.set(doc_id=\"123\").set(facets={\"title\": \"my title\"})

    \n\n
    Parameters
    \n\n
      \n
    • doc_id:
    • \n
    • facets:
    • \n
    • score:
    • \n
    • facet:
    • \n
    • facet_value:
    • \n
    \n\n
    Returns
    \n\n
    \n
    Returns document object\n
    \n
    \n", "signature": "(\tself,\tdoc_id=None,\tfacets=None,\tscore=None,\tfacet=None,\tfacet_value=None) -> debeir.core.document.Document:", "funcdef": "def"}, {"fullname": "debeir.core.document.Document.to_trec_format", "modulename": "debeir.core.document", "qualname": "Document.to_trec_format", "kind": "function", "doc": "

    Returns TREC format for the document

    \n\n
    Returns
    \n\n
    \n
    A trec formatted string\n
    \n
    \n", "signature": "(self, rank, run_name) -> str:", "funcdef": "def"}, {"fullname": "debeir.core.document.Document.get_trec_format", "modulename": "debeir.core.document", "qualname": "Document.get_trec_format", "kind": "function", "doc": "

    Get the trec format of a list of ranked documents. This function is a generator.

    \n\n
    Parameters
    \n\n
      \n
    • ranked_list: A list of Document-type objects
    • \n
    • run_name: Run name to print in the TREC formatted string
    • \n
    • sort: Whether to sort the input list in descending order of score.
    • \n
    • sorting_func: Custom sorting function will be used if provided
    • \n
    \n", "signature": "(\tcls,\tranked_list: List[debeir.core.document.Document],\trun_name='NO_RUN_NAME',\tsort=True,\tsorting_func=None):", "funcdef": "def"}, {"fullname": "debeir.core.document.ElasticsearchDocument", "modulename": "debeir.core.document", "qualname": "ElasticsearchDocument", "kind": "class", "doc": "

    Generic Document class.\nUsed as an interface for interacting across multiple indexes with different mappings.

    \n", "bases": "Document"}, {"fullname": "debeir.core.document.ElasticsearchDocument.from_results", "modulename": "debeir.core.document", "qualname": "ElasticsearchDocument.from_results", "kind": "function", "doc": "

    Produces a list of Document objects from raw results returned from the index

    \n\n

    In the format {topic_num: [Document, ..., Document]}

    \n", "signature": "(\tcls,\tresults,\tquery_cls,\tignore_facets=True,\t*args,\t**kwargs) -> Dict[Union[int, float], debeir.core.document.Document]:", "funcdef": "def"}, {"fullname": "debeir.core.executor", "modulename": "debeir.core.executor", "kind": "module", "doc": "

    \n"}, {"fullname": "debeir.core.executor.GenericElasticsearchExecutor", "modulename": "debeir.core.executor", "qualname": "GenericElasticsearchExecutor", "kind": "class", "doc": "

    Generic Executor class for Elasticsearch

    \n", "bases": "debeir.engines.elasticsearch.executor.ElasticsearchExecutor"}, {"fullname": "debeir.core.executor.GenericElasticsearchExecutor.__init__", "modulename": "debeir.core.executor", "qualname": "GenericElasticsearchExecutor.__init__", "kind": "function", "doc": "

    \n", "signature": "(\ttopics: Dict[Union[str, int], Dict[str, str]],\tclient: elasticsearch.AsyncElasticsearch,\tindex_name: str,\toutput_file: str,\tquery: debeir.core.query.GenericElasticsearchQuery,\tencoder: Optional[debeir.rankers.transformer_sent_encoder.Encoder] = None,\tconfig=None,\t*args,\t**kwargs)"}, {"fullname": "debeir.core.executor.GenericElasticsearchExecutor.generate_query", "modulename": "debeir.core.executor", "qualname": "GenericElasticsearchExecutor.generate_query", "kind": "function", "doc": "

    Generates a standard BM25 query given the topic number

    \n\n
    Parameters
    \n\n
      \n
    • topic_num: Query topic number to generate
    • \n
    • best_fields: Whether to use a curated list of fields
    • \n
    • kwargs:
    • \n
    \n\n
    Returns
    \n", "signature": "(self, topic_num, best_fields=True, **kwargs):", "funcdef": "def"}, {"fullname": "debeir.core.executor.GenericElasticsearchExecutor.generate_embedding_query", "modulename": "debeir.core.executor", "qualname": "GenericElasticsearchExecutor.generate_embedding_query", "kind": "function", "doc": "

    Executes an NIR-style query with combined scoring.

    \n\n
    Parameters
    \n\n
      \n
    • topic_num:
    • \n
    • cosine_weights:
    • \n
    • query_weights:
    • \n
    • norm_weight:
    • \n
    • automatic_scores:
    • \n
    • kwargs:
    • \n
    \n\n
    Returns
    \n", "signature": "(\tself,\ttopic_num,\tcosine_weights=None,\tquery_weights=None,\tnorm_weight=2.15,\tautomatic_scores=None,\t**kwargs):", "funcdef": "def"}, {"fullname": "debeir.core.executor.GenericElasticsearchExecutor.execute_query", "modulename": "debeir.core.executor", "qualname": "GenericElasticsearchExecutor.execute_query", "kind": "function", "doc": "

    Execute a query given parameters

    \n\n
    Parameters
    \n\n
      \n
    • args:
    • \n
    • kwargs:
    • \n
    \n", "signature": "(\tself,\tquery=None,\treturn_size: int = None,\treturn_id_only: bool = None,\ttopic_num=None,\tablation=False,\tquery_type=None,\t**kwargs):", "funcdef": "async def"}, {"fullname": "debeir.core.executor.GenericElasticsearchExecutor.run_automatic_adjustment", "modulename": "debeir.core.executor", "qualname": "GenericElasticsearchExecutor.run_automatic_adjustment", "kind": "function", "doc": "

    Get the normalization constant to be used in NIR-style queries for all topics given an initial\nrun of BM25 results.

    \n", "signature": "(self, return_results=False):", "funcdef": "async def"}, {"fullname": "debeir.core.executor.GenericElasticsearchExecutor.build_from_config", "modulename": "debeir.core.executor", "qualname": "GenericElasticsearchExecutor.build_from_config", "kind": "function", "doc": "

    Build a query executor engine from a config file.

    \n", "signature": "(\tcls,\ttopics: Dict,\tquery_obj: debeir.core.query.GenericElasticsearchQuery,\tclient,\tconfig: debeir.core.config.GenericConfig,\tnir_config: debeir.core.config.NIRConfig):", "funcdef": "def"}, {"fullname": "debeir.core.indexer", "modulename": "debeir.core.indexer", "kind": "module", "doc": "

    \n"}, {"fullname": "debeir.core.indexer.Indexer", "modulename": "debeir.core.indexer", "qualname": "Indexer", "kind": "class", "doc": "

    \n"}, {"fullname": "debeir.core.indexer.Indexer.__init__", "modulename": "debeir.core.indexer", "qualname": "Indexer.__init__", "kind": "function", "doc": "

    \n", "signature": "(client)"}, {"fullname": "debeir.core.indexer.Indexer.get_field", "modulename": "debeir.core.indexer", "qualname": "Indexer.get_field", "kind": "function", "doc": "

    \n", "signature": "(self, document, field):", "funcdef": "def"}, {"fullname": "debeir.core.indexer.SemanticElasticsearchIndexer", "modulename": "debeir.core.indexer", "qualname": "SemanticElasticsearchIndexer", "kind": "class", "doc": "

    Create a NIR-style index with dense field representations from the provided sentence encoder.\nAssumes the documents have already been indexed to start with.

    \n", "bases": "Indexer, threading.Thread"}, {"fullname": "debeir.core.indexer.SemanticElasticsearchIndexer.__init__", "modulename": "debeir.core.indexer", "qualname": "SemanticElasticsearchIndexer.__init__", "kind": "function", "doc": "

    This constructor should always be called with keyword arguments. Arguments are:

    \n\n

    group should be None; reserved for future extension when a ThreadGroup\nclass is implemented.

    \n\n

    target is the callable object to be invoked by the run()\nmethod. Defaults to None, meaning nothing is called.

    \n\n

    name is the thread name. By default, a unique name is constructed of\nthe form \"Thread-N\" where N is a small decimal number.

    \n\n

    args is the argument tuple for the target invocation. Defaults to ().

    \n\n

    kwargs is a dictionary of keyword arguments for the target\ninvocation. Defaults to {}.

    \n\n

    If a subclass overrides the constructor, it must make sure to invoke\nthe base class constructor (Thread.__init__()) before doing anything\nelse to the thread.

    \n", "signature": "(\tes_client: elasticsearch.Elasticsearch,\tencoder: debeir.rankers.transformer_sent_encoder.Encoder,\tindex: str,\tfields_to_encode: List[str],\tqueue: queue.Queue)"}, {"fullname": "debeir.core.indexer.SemanticElasticsearchIndexer.update_mappings", "modulename": "debeir.core.indexer", "qualname": "SemanticElasticsearchIndexer.update_mappings", "kind": "function", "doc": "

    \n", "signature": "(self, index, fields, client: elasticsearch.Elasticsearch):", "funcdef": "def"}, {"fullname": "debeir.core.indexer.SemanticElasticsearchIndexer.get_field", "modulename": "debeir.core.indexer", "qualname": "SemanticElasticsearchIndexer.get_field", "kind": "function", "doc": "

    \n", "signature": "(self, document, field):", "funcdef": "def"}, {"fullname": "debeir.core.indexer.SemanticElasticsearchIndexer.index_document", "modulename": "debeir.core.indexer", "qualname": "SemanticElasticsearchIndexer.index_document", "kind": "function", "doc": "

    \n", "signature": "(self, document):", "funcdef": "def"}, {"fullname": "debeir.core.indexer.SemanticElasticsearchIndexer.run", "modulename": "debeir.core.indexer", "qualname": "SemanticElasticsearchIndexer.run", "kind": "function", "doc": "

    Method representing the thread's activity.

    \n\n

    You may override this method in a subclass. The standard run() method\ninvokes the callable object passed to the object's constructor as the\ntarget argument, if any, with sequential and keyword arguments taken\nfrom the args and kwargs arguments, respectively.

    \n", "signature": "(self):", "funcdef": "def"}, {"fullname": "debeir.core.parser", "modulename": "debeir.core.parser", "kind": "module", "doc": "

    \n"}, {"fullname": "debeir.core.parser.Parser", "modulename": "debeir.core.parser", "qualname": "Parser", "kind": "class", "doc": "

    Parser interface

    \n"}, {"fullname": "debeir.core.parser.Parser.__init__", "modulename": "debeir.core.parser", "qualname": "Parser.__init__", "kind": "function", "doc": "

    \n", "signature": "(id_field: object, parse_fields: List[str])"}, {"fullname": "debeir.core.parser.Parser.normalize", "modulename": "debeir.core.parser", "qualname": "Parser.normalize", "kind": "function", "doc": "

    Flatten the dictionary, i.e. from Dict[int, Dict] -> Dict[str, str_or_int]

    \n\n
    Parameters
    \n\n
      \n
    • input_dict:
    • \n
    \n\n
    Returns
    \n", "signature": "(cls, input_dict) -> Dict:", "funcdef": "def"}, {"fullname": "debeir.core.parser.Parser.get_topics", "modulename": "debeir.core.parser", "qualname": "Parser.get_topics", "kind": "function", "doc": "

    Instance method for getting topics, forwards instance self parameters to the _get_topics class method.

    \n", "signature": "(self, path, *args, **kwargs):", "funcdef": "def"}, {"fullname": "debeir.core.parser.PickleParser", "modulename": "debeir.core.parser", "qualname": "PickleParser", "kind": "class", "doc": "

    Load topics from a pickle file

    \n", "bases": "Parser"}, {"fullname": "debeir.core.parser.PickleParser.__init__", "modulename": "debeir.core.parser", "qualname": "PickleParser.__init__", "kind": "function", "doc": "

    \n", "signature": "(id_field: object, parse_fields: List[str])"}, {"fullname": "debeir.core.parser.XMLParser", "modulename": "debeir.core.parser", "qualname": "XMLParser", "kind": "class", "doc": "

    Load topics from an XML file

    \n", "bases": "Parser"}, {"fullname": "debeir.core.parser.XMLParser.__init__", "modulename": "debeir.core.parser", "qualname": "XMLParser.__init__", "kind": "function", "doc": "

    \n", "signature": "(id_field: str, parse_fields: List[str], topic_field_name: str)"}, {"fullname": "debeir.core.parser.XMLParser.unwrap", "modulename": "debeir.core.parser", "qualname": "XMLParser.unwrap", "kind": "function", "doc": "

    Converts a defaultdict to a dict, and unwraps a list of size 1 to just its element

    \n\n
    Parameters
    \n\n
      \n
    • doc_dict:
    • \n
    • key:
    • \n
    \n", "signature": "(cls, doc_dict, key):", "funcdef": "def"}, {"fullname": "debeir.core.parser.CSVParser", "modulename": "debeir.core.parser", "qualname": "CSVParser", "kind": "class", "doc": "

    Loads topics from a CSV file

    \n", "bases": "Parser"}, {"fullname": "debeir.core.parser.CSVParser.__init__", "modulename": "debeir.core.parser", "qualname": "CSVParser.__init__", "kind": "function", "doc": "

    \n", "signature": "(id_field=None, parse_fields=None)"}, {"fullname": "debeir.core.parser.TSVParser", "modulename": "debeir.core.parser", "qualname": "TSVParser", "kind": "class", "doc": "

    \n", "bases": "CSVParser"}, {"fullname": "debeir.core.parser.TSVParser.__init__", "modulename": "debeir.core.parser", "qualname": "TSVParser.__init__", "kind": "function", "doc": "

    \n", "signature": "(id_field: object, parse_fields: List[str])"}, {"fullname": "debeir.core.parser.JsonLinesParser", "modulename": "debeir.core.parser", "qualname": "JsonLinesParser", "kind": "class", "doc": "

    Loads topics from a jsonl file,\none JSON object per line

    \n\n

    Provide parse_fields, id_field, and whether to ignore full matches on JSON keys.\nsecondary_id is appended to the primary id, as jsonlines are a flattened structure and may contain duplicate ids.

    \n", "bases": "Parser"}, {"fullname": "debeir.core.parser.JsonLinesParser.__init__", "modulename": "debeir.core.parser", "qualname": "JsonLinesParser.__init__", "kind": "function", "doc": "

    \n", "signature": "(\tid_field: str,\tparse_fields: List[str],\tignore_full_match: bool = True,\tsecondary_id: str = None)"}, {"fullname": "debeir.core.pipeline", "modulename": "debeir.core.pipeline", "kind": "module", "doc": "

    \n"}, {"fullname": "debeir.core.pipeline.Pipeline", "modulename": "debeir.core.pipeline", "qualname": "Pipeline", "kind": "class", "doc": "

    \n"}, {"fullname": "debeir.core.pipeline.Pipeline.__init__", "modulename": "debeir.core.pipeline", "qualname": "Pipeline.__init__", "kind": "function", "doc": "

    \n", "signature": "(\tengine: debeir.core.executor.GenericElasticsearchExecutor,\tengine_name: str,\tmetrics_config,\tengine_config,\tnir_config,\trun_config: debeir.core.config.Config,\tcallbacks=None)"}, {"fullname": "debeir.core.pipeline.Pipeline.disable", "modulename": "debeir.core.pipeline", "qualname": "Pipeline.disable", "kind": "function", "doc": "

    \n", "signature": "(self, parts: list):", "funcdef": "def"}, {"fullname": "debeir.core.pipeline.Pipeline.build_from_config", "modulename": "debeir.core.pipeline", "qualname": "Pipeline.build_from_config", "kind": "function", "doc": "

    \n", "signature": "(cls, nir_config_fp, engine, config_fp) -> debeir.core.pipeline.Pipeline:", "funcdef": "def"}, {"fullname": "debeir.core.pipeline.Pipeline.run_pipeline", "modulename": "debeir.core.pipeline", "qualname": "Pipeline.run_pipeline", "kind": "function", "doc": "

    \n", "signature": "(self, *args, **kwargs):", "funcdef": "async def"}, {"fullname": "debeir.core.pipeline.NIRPipeline", "modulename": "debeir.core.pipeline", "qualname": "NIRPipeline", "kind": "class", "doc": "

    \n", "bases": "Pipeline"}, {"fullname": "debeir.core.pipeline.NIRPipeline.__init__", "modulename": "debeir.core.pipeline", "qualname": "NIRPipeline.__init__", "kind": "function", "doc": "

    \n", "signature": "(*args, **kwargs)"}, {"fullname": "debeir.core.pipeline.NIRPipeline.prehook", "modulename": "debeir.core.pipeline", "qualname": "NIRPipeline.prehook", "kind": "function", "doc": "

    \n", "signature": "(self):", "funcdef": "async def"}, {"fullname": "debeir.core.pipeline.NIRPipeline.run_engine", "modulename": "debeir.core.pipeline", "qualname": "NIRPipeline.run_engine", "kind": "function", "doc": "

    \n", "signature": "(self, *args, **kwargs):", "funcdef": "async def"}, {"fullname": "debeir.core.pipeline.NIRPipeline.posthook", "modulename": "debeir.core.pipeline", "qualname": "NIRPipeline.posthook", "kind": "function", "doc": "

    \n", "signature": "(self, *args, **kwargs):", "funcdef": "async def"}, {"fullname": "debeir.core.pipeline.NIRPipeline.run_pipeline", "modulename": "debeir.core.pipeline", "qualname": "NIRPipeline.run_pipeline", "kind": "function", "doc": "

    \n", "signature": "(self, *args, return_results=False, **kwargs):", "funcdef": "async def"}, {"fullname": "debeir.core.pipeline.NIRPipeline.register_callback", "modulename": "debeir.core.pipeline", "qualname": "NIRPipeline.register_callback", "kind": "function", "doc": "

    \n", "signature": "(self, cb):", "funcdef": "def"}, {"fullname": "debeir.core.pipeline.BM25Pipeline", "modulename": "debeir.core.pipeline", "qualname": "BM25Pipeline", "kind": "class", "doc": "

    \n", "bases": "NIRPipeline"}, {"fullname": "debeir.core.pipeline.BM25Pipeline.run_pipeline", "modulename": "debeir.core.pipeline", "qualname": "BM25Pipeline.run_pipeline", "kind": "function", "doc": "

    \n", "signature": "(self, *args, return_results=False, **kwargs):", "funcdef": "async def"}, {"fullname": "debeir.core.query", "modulename": "debeir.core.query", "kind": "module", "doc": "

    \n"}, {"fullname": "debeir.core.query.Query", "modulename": "debeir.core.query", "qualname": "Query", "kind": "class", "doc": "

    A query interface class

    \n\n
    Parameters
    \n\n
      \n
    • topics: Topics that the query will be composed of
    • \n
    • config: Config object that contains the settings for querying
    • \n
    \n"}, {"fullname": "debeir.core.query.Query.__init__", "modulename": "debeir.core.query", "qualname": "Query.__init__", "kind": "function", "doc": "

    \n", "signature": "(\ttopics: Dict[int, Dict[str, str]],\tconfig: debeir.core.config.GenericConfig)"}, {"fullname": "debeir.core.query.GenericElasticsearchQuery", "modulename": "debeir.core.query", "qualname": "GenericElasticsearchQuery", "kind": "class", "doc": "

    A generic elasticsearch query. Contains methods for NIR-style (embedding) queries and normal BM25 queries.\nRequires topics and configs to be included.

    \n", "bases": "Query"}, {"fullname": "debeir.core.query.GenericElasticsearchQuery.__init__", "modulename": "debeir.core.query", "qualname": "GenericElasticsearchQuery.__init__", "kind": "function", "doc": "

    \n", "signature": "(\ttopics,\tconfig,\ttop_bm25_scores=None,\tmappings=None,\tid_mapping=None,\t*args,\t**kwargs)"}, {"fullname": "debeir.core.query.GenericElasticsearchQuery.generate_query", "modulename": "debeir.core.query", "qualname": "GenericElasticsearchQuery.generate_query", "kind": "function", "doc": "

    Generates a simple BM25 query based on the query facets. Searches over all the document facets.

    \n\n
    Parameters
    \n\n
      \n
    • topic_num:
    • \n
    • args:
    • \n
    • kwargs:
    • \n
    \n\n
    Returns
    \n", "signature": "(self, topic_num, *args, **kwargs):", "funcdef": "def"}, {"fullname": "debeir.core.query.GenericElasticsearchQuery.set_bm25_scores", "modulename": "debeir.core.query", "qualname": "GenericElasticsearchQuery.set_bm25_scores", "kind": "function", "doc": "

    Sets BM25 scores that are used for NIR-style scoring. The top BM25 score for each topic is used\nfor log normalization.

    \n\n

    Score = log(bm25)/log(z) + embed_score

    \n\n
    Parameters
    \n\n
      \n
    • scores: Top BM25 Scores of the form {topic_num: top_bm25_score}
    • \n
    \n", "signature": "(self, scores: Dict[Union[str, int], Union[int, float]]):", "funcdef": "def"}, {"fullname": "debeir.core.query.GenericElasticsearchQuery.has_bm25_scores", "modulename": "debeir.core.query", "qualname": "GenericElasticsearchQuery.has_bm25_scores", "kind": "function", "doc": "

    Checks if BM25 scores have been set

    \n\n
    Returns
    \n", "signature": "(self):", "funcdef": "def"}, {"fullname": "debeir.core.query.GenericElasticsearchQuery.generate_query_embedding", "modulename": "debeir.core.query", "qualname": "GenericElasticsearchQuery.generate_query_embedding", "kind": "function", "doc": "

    Generates an embedding script score query for Elasticsearch as part of the NIR scoring function.

    \n\n
    Parameters
    \n\n
      \n
    • topic_num: The topic number to search for
    • \n
    • encoder: The encoder that will be used for encoding the topics
    • \n
    • norm_weight: The BM25 log normalization constant
    • \n
    • ablations: Whether to execute ablation style queries (i.e. one query facet\nor one document facet at a time)
    • \n
    • cosine_ceiling: Cosine ceiling used for automatic z-log normalization parameter calculation
    • \n
    • args:
    • \n
    • kwargs: Pass disable_cache to disable encoder caching
    • \n
    \n\n
    Returns
    \n\n
    \n
    An elasticsearch script_score query\n
    \n
    \n", "signature": "(self, *args, **kwargs):", "funcdef": "def"}, {"fullname": "debeir.core.query.GenericElasticsearchQuery.get_id_mapping", "modulename": "debeir.core.query", "qualname": "GenericElasticsearchQuery.get_id_mapping", "kind": "function", "doc": "

    Get the document ID

    \n\n
    Parameters
    \n\n
      \n
    • hit: The raw document result
    • \n
    \n\n
    Returns
    \n\n
    \n
    The document's ID\n
    \n
    \n", "signature": "(cls, hit):", "funcdef": "def"}, {"fullname": "debeir.core.results", "modulename": "debeir.core.results", "kind": "module", "doc": "

    \n"}, {"fullname": "debeir.core.results.Results", "modulename": "debeir.core.results", "qualname": "Results", "kind": "class", "doc": "

    \n"}, {"fullname": "debeir.core.results.Results.__init__", "modulename": "debeir.core.results", "qualname": "Results.__init__", "kind": "function", "doc": "

    \n", "signature": "(results: List, query_cls, engine_name)"}, {"fullname": "debeir.core.results.Results.get_topic_ids", "modulename": "debeir.core.results", "qualname": "Results.get_topic_ids", "kind": "function", "doc": "

    \n", "signature": "(self):", "funcdef": "def"}, {"fullname": "debeir.datasets", "modulename": "debeir.datasets", "kind": "module", "doc": "

    Contains datasets implemented from debeir.core

    \n\n
      \n
    1. Parser (For reading data from files into a Dict object)
    2. Query object (Generating queries)\n
        \n
      • These query objects can be very lightweight, containing only the mappings of the index.
      • \n
    \n"}, {"fullname": "debeir.datasets.bioreddit", "modulename": "debeir.datasets.bioreddit", "kind": "module", "doc": "

    \n"}, {"fullname": "debeir.datasets.bioreddit.BioRedditSubmissionParser", "modulename": "debeir.datasets.bioreddit", "qualname": "BioRedditSubmissionParser", "kind": "class", "doc": "

    Parser for the BioReddit Submission Dataset

    \n", "bases": "debeir.core.parser.CSVParser"}, {"fullname": "debeir.datasets.bioreddit.BioRedditSubmissionParser.get_topics", "modulename": "debeir.datasets.bioreddit", "qualname": "BioRedditSubmissionParser.get_topics", "kind": "function", "doc": "

    Instance method for getting topics, forwards instance self parameters to the _get_topics class method.

    \n", "signature": "(cls, csvfile) -> Dict[int, Dict[str, str]]:", "funcdef": "def"}, {"fullname": "debeir.datasets.bioreddit.BioRedditCommentParser", "modulename": "debeir.datasets.bioreddit", "qualname": "BioRedditCommentParser", "kind": "class", "doc": "

    Parser for the BioReddit Comment Dataset

    \n", "bases": "debeir.core.parser.CSVParser"}, {"fullname": "debeir.datasets.bioreddit.BioRedditCommentParser.get_topics", "modulename": "debeir.datasets.bioreddit", "qualname": "BioRedditCommentParser.get_topics", "kind": "function", "doc": "

    Instance method for getting topics, forwards instance self parameters to the _get_topics class method.

    \n", "signature": "(cls, csvfile) -> Dict[str, Dict[str, str]]:", "funcdef": "def"}, {"fullname": "debeir.datasets.bioreddit.BioRedditElasticsearchQuery", "modulename": "debeir.datasets.bioreddit", "qualname": "BioRedditElasticsearchQuery", "kind": "class", "doc": "

    Elasticsearch Query object for the BioReddit

    \n", "bases": "debeir.core.query.GenericElasticsearchQuery"}, {"fullname": "debeir.datasets.bioreddit.BioRedditElasticsearchQuery.__init__", "modulename": "debeir.datasets.bioreddit", "qualname": "BioRedditElasticsearchQuery.__init__", "kind": "function", "doc": "

    \n", "signature": "(topics, config, *args, **kwargs)"}, {"fullname": "debeir.datasets.clinical_trials", "modulename": "debeir.datasets.clinical_trials", "kind": "module", "doc": "

    \n"}, {"fullname": "debeir.datasets.clinical_trials.TrialsQueryConfig", "modulename": "debeir.datasets.clinical_trials", "qualname": "TrialsQueryConfig", "kind": "class", "doc": "

    \n", "bases": "debeir.core.config.GenericConfig"}, {"fullname": "debeir.datasets.clinical_trials.TrialsQueryConfig.__init__", "modulename": "debeir.datasets.clinical_trials", "qualname": "TrialsQueryConfig.__init__", "kind": "function", "doc": "

    \n", "signature": "(\tquery_type: str,\tindex: str = None,\tencoder_normalize: bool = True,\tablations: bool = False,\tnorm_weight: float = None,\tautomatic: bool = None,\tencoder: object = None,\tencoder_fp: str = None,\tquery_weights: List[float] = None,\tcosine_weights: List[float] = None,\tevaluate: bool = False,\tqrels: str = None,\tconfig_fn: str = None,\tquery_fn: str = None,\tparser_fn: str = None,\texecutor_fn: str = None,\tcosine_ceiling: float = None,\ttopics_path: str = None,\treturn_id_only: bool = False,\toverwrite_output_if_exists: bool = False,\toutput_file: str = None,\trun_name: str = None,\tquery_field_usage: str = None,\tembed_field_usage: str = None,\tfields: List[str] = None)"}, {"fullname": "debeir.datasets.clinical_trials.TrialsQueryConfig.validate", "modulename": "debeir.datasets.clinical_trials", "qualname": "TrialsQueryConfig.validate", "kind": "function", "doc": "

    Checks if query type is included, and checks if an encoder is included for embedding queries

    \n", "signature": "(self):", "funcdef": "def"}, {"fullname": "debeir.datasets.clinical_trials.TrialsQueryConfig.from_toml", "modulename": "debeir.datasets.clinical_trials", "qualname": "TrialsQueryConfig.from_toml", "kind": "function", "doc": "

    Instantiates a Config object from a toml file

    \n\n
    Parameters
    \n\n
      \n
    • fp: File path of the Config TOML file
    • \n
    • field_class: Class of the Config object to be instantiated
    • \n
    • args: Arguments to be passed to Config
    • \n
    • kwargs: Keyword arguments to be passed
    • \n
    \n\n
    Returns
    \n\n
    \n
    An instantiated and validated Config object.\n
    \n
    \n", "signature": "(cls, fp: str, *args, **kwargs) -> debeir.core.config.GenericConfig:", "funcdef": "def"}, {"fullname": "debeir.datasets.clinical_trials.TrialsQueryConfig.from_dict", "modulename": "debeir.datasets.clinical_trials", "qualname": "TrialsQueryConfig.from_dict", "kind": "function", "doc": "

    Instantiates a Config object from a dictionary

    \n\n
    Parameters
    \n\n
      \n
    • data_class:
    • \n
    • kwargs:
    • \n
    \n\n
    Returns
    \n", "signature": "(cls, **kwargs) -> debeir.core.config.GenericConfig:", "funcdef": "def"}, {"fullname": "debeir.datasets.clinical_trials.TrialsElasticsearchQuery", "modulename": "debeir.datasets.clinical_trials", "qualname": "TrialsElasticsearchQuery", "kind": "class", "doc": "

    Elasticsearch Query object for the Clinical Trials Index

    \n", "bases": "debeir.core.query.GenericElasticsearchQuery"}, {"fullname": "debeir.datasets.clinical_trials.TrialsElasticsearchQuery.__init__", "modulename": "debeir.datasets.clinical_trials", "qualname": "TrialsElasticsearchQuery.__init__", "kind": "function", "doc": "

    \n", "signature": "(topics, query_type, config=None, *args, **kwargs)"}, {"fullname": "debeir.datasets.clinical_trials.TrialsElasticsearchQuery.generate_query", "modulename": "debeir.datasets.clinical_trials", "qualname": "TrialsElasticsearchQuery.generate_query", "kind": "function", "doc": "

    Generates a query for the clinical trials index

    \n\n
    Parameters
    \n\n
      \n
    • topic_num: Topic number to search
    • \n
    • query_field_usage: Which document facets to search over
    • \n
    • kwargs:
    • \n
    \n\n
    Returns
    \n\n
    \n
    A basic elasticsearch query for clinical trials\n
    \n
    \n", "signature": "(self, *args, **kwargs):", "funcdef": "def"}, {"fullname": "debeir.datasets.clinical_trials.TrialsElasticsearchQuery.generate_query_ablation", "modulename": "debeir.datasets.clinical_trials", "qualname": "TrialsElasticsearchQuery.generate_query_ablation", "kind": "function", "doc": "

    Only search one document facet at a time

    \n\n
    Parameters
    \n\n
      \n
    • topic_num:
    • \n
    • kwargs:
    • \n
    \n\n
    Returns
    \n", "signature": "(self, topic_num, **kwargs):", "funcdef": "def"}, {"fullname": "debeir.datasets.clinical_trials.TrialsElasticsearchQuery.generate_query_embedding", "modulename": "debeir.datasets.clinical_trials", "qualname": "TrialsElasticsearchQuery.generate_query_embedding", "kind": "function", "doc": "

    Computes the NIR score for a given topic

    \n\n

    Score = log(BM25)/log(norm_weight) + embedding_score

    \n\n
    Parameters
    \n\n
      \n
    • topic_num:
    • \n
    • encoder:
    • \n
    • query_field_usage:
    • \n
    • embed_field_usage:
    • \n
    • cosine_weights:
    • \n
    • query_weight:
    • \n
    • norm_weight:
    • \n
    • ablations:
    • \n
    • automatic_scores:
    • \n
    • kwargs:
    • \n
    \n\n
    Returns
    \n", "signature": "(self, *args, **kwargs):", "funcdef": "def"}, {"fullname": "debeir.datasets.clinical_trials.TrialsElasticsearchQuery.get_query_type", "modulename": "debeir.datasets.clinical_trials", "qualname": "TrialsElasticsearchQuery.get_query_type", "kind": "function", "doc": "

    \n", "signature": "(self, *args, **kwargs):", "funcdef": "def"}, {"fullname": "debeir.datasets.clinical_trials.TrialsElasticsearchQuery.get_id_mapping", "modulename": "debeir.datasets.clinical_trials", "qualname": "TrialsElasticsearchQuery.get_id_mapping", "kind": "function", "doc": "

    Get the document ID

    \n\n
    Parameters
    \n\n
      \n
    • hit: The raw document result
    • \n
    \n\n
    Returns
    \n\n
    \n
    The document's ID\n
    \n
    \n", "signature": "(self, hit):", "funcdef": "def"}, {"fullname": "debeir.datasets.clinical_trials.ClinicalTrialsElasticsearchExecutor", "modulename": "debeir.datasets.clinical_trials", "qualname": "ClinicalTrialsElasticsearchExecutor", "kind": "class", "doc": "

    Executes queries given a query object.

    \n", "bases": "debeir.core.executor.GenericElasticsearchExecutor"}, {"fullname": "debeir.datasets.clinical_trials.ClinicalTrialsElasticsearchExecutor.__init__", "modulename": "debeir.datasets.clinical_trials", "qualname": "ClinicalTrialsElasticsearchExecutor.__init__", "kind": "function", "doc": "

    \n", "signature": "(\ttopics: Dict[Union[str, int], Dict[str, str]],\tclient: elasticsearch.AsyncElasticsearch,\tindex_name: str,\toutput_file: str,\tquery: debeir.datasets.clinical_trials.TrialsElasticsearchQuery,\tencoder: Optional[debeir.rankers.transformer_sent_encoder.Encoder] = None,\tconfig=None,\t*args,\t**kwargs)"}, {"fullname": "debeir.datasets.clinical_trials.ClinicalTrialParser", "modulename": "debeir.datasets.clinical_trials", "qualname": "ClinicalTrialParser", "kind": "class", "doc": "

    Parser for Clinical Trials topics

    \n", "bases": "debeir.core.parser.Parser"}, {"fullname": "debeir.datasets.clinical_trials.ClinicalTrialParser.get_topics", "modulename": "debeir.datasets.clinical_trials", "qualname": "ClinicalTrialParser.get_topics", "kind": "function", "doc": "

    Instance method for getting topics, forwards instance self parameters to the _get_topics class method.

    \n", "signature": "(cls, csvfile) -> Dict[int, Dict[str, str]]:", "funcdef": "def"}, {"fullname": "debeir.datasets.factory", "modulename": "debeir.datasets.factory", "kind": "module", "doc": "

    \n"}, {"fullname": "debeir.datasets.factory.get_index_name", "modulename": "debeir.datasets.factory", "qualname": "get_index_name", "kind": "function", "doc": "

    Get the index name from the config without parsing as a TOML

    \n\n
    Parameters
    \n\n
      \n
    • config_fp:
    • \n
    \n\n
    Returns
    \n", "signature": "(config_fp):", "funcdef": "def"}, {"fullname": "debeir.datasets.factory.factory_fn", "modulename": "debeir.datasets.factory", "qualname": "factory_fn", "kind": "function", "doc": "

    Factory method for creating the parsed topics, config object, query object and query executor object

    \n\n
    Parameters
    \n\n
      \n
    • config_fp: Config file path
    • \n
    • index: Index to search
    • \n
    \n\n
    Returns
    \n\n
    \n
    Query, Config, Parser, Executor, Evaluator\n
    \n
    \n", "signature": "(\tconfig_fp,\tindex=None) -> (<class 'debeir.core.query.Query'>, <class 'debeir.core.config.GenericConfig'>, <class 'debeir.core.parser.Parser'>, <class 'debeir.core.executor.GenericElasticsearchExecutor'>, <class 'debeir.evaluation.evaluator.Evaluator'>):", "funcdef": "def"}, {"fullname": "debeir.datasets.factory.config_factory", "modulename": "debeir.datasets.factory", "qualname": "config_factory", "kind": "function", "doc": "

    Factory method for creating configs

    \n\n
    Parameters
    \n\n
      \n
    • path: Config path
    • \n
    • config_cls: Config class to instantiate
    • \n
    • args_dict: Arguments to consider
    • \n
    \n\n
    Returns
    \n\n
    \n
    A config object\n
    \n
    \n", "signature": "(\tpath: Union[str, pathlib.Path] = None,\tconfig_cls: Type[debeir.core.config.Config] = None,\targs_dict: Dict = None):", "funcdef": "def"}, {"fullname": "debeir.datasets.factory.get_nir_config", "modulename": "debeir.datasets.factory", "qualname": "get_nir_config", "kind": "function", "doc": "

    \n", "signature": "(nir_config, *args, ignore_errors=False, **kwargs):", "funcdef": "def"}, {"fullname": "debeir.datasets.factory.apply_nir_config", "modulename": "debeir.datasets.factory", "qualname": "apply_nir_config", "kind": "function", "doc": "

    Decorator that applies the NIR config settings to the current function.\nReplaces arguments and keyword arguments with those found in the config.

    \n\n
    Parameters
    \n\n
      \n
    • func:
    • \n
    \n\n
    Returns
    \n", "signature": "(func):", "funcdef": "def"}, {"fullname": "debeir.datasets.marco", "modulename": "debeir.datasets.marco", "kind": "module", "doc": "

    \n"}, {"fullname": "debeir.datasets.marco.MarcoElasticsearchExecutor", "modulename": "debeir.datasets.marco", "qualname": "MarcoElasticsearchExecutor", "kind": "class", "doc": "

    Generic Executor class for Elasticsearch

    \n", "bases": "debeir.core.executor.GenericElasticsearchExecutor"}, {"fullname": "debeir.datasets.marco.MarcoElasticsearchExecutor.__init__", "modulename": "debeir.datasets.marco", "qualname": "MarcoElasticsearchExecutor.__init__", "kind": "function", "doc": "

    \n", "signature": "(\ttopics: Dict[Union[str, int], Dict[str, str]],\tclient: elasticsearch.AsyncElasticsearch,\tindex_name: str,\toutput_file: str,\tquery: debeir.core.query.GenericElasticsearchQuery,\tencoder: Optional[debeir.rankers.transformer_sent_encoder.Encoder] = None,\tconfig=None,\t*args,\t**kwargs)"}, {"fullname": "debeir.datasets.marco.MarcoElasticsearchExecutor.generate_query", "modulename": "debeir.datasets.marco", "qualname": "MarcoElasticsearchExecutor.generate_query", "kind": "function", "doc": "

    Generates a standard BM25 query given the topic number

    \n\n
    Parameters
    \n\n
      \n
    • topic_num: Query topic number to generate
    • \n
    • best_fields: Whether to use a curated list of fields
    • \n
    • kwargs:
    • \n
    \n\n
    Returns
    \n", "signature": "(self, topic_num, best_fields=True, **kwargs):", "funcdef": "def"}, {"fullname": "debeir.datasets.marco.MarcoElasticsearchExecutor.generate_embedding_query", "modulename": "debeir.datasets.marco", "qualname": "MarcoElasticsearchExecutor.generate_embedding_query", "kind": "function", "doc": "

    Executes an NIR-style query with combined scoring.

    \n\n
    Parameters
    \n\n
      \n
    • topic_num:
    • \n
    • cosine_weights:
    • \n
    • query_weights:
    • \n
    • norm_weight:
    • \n
    • automatic_scores:
    • \n
    • kwargs:
    • \n
    \n\n
    Returns
    \n", "signature": "(\tself,\ttopic_num,\tcosine_weights=None,\tquery_weights=None,\tnorm_weight=2.15,\tautomatic_scores=None,\t**kwargs):", "funcdef": "def"}, {"fullname": "debeir.datasets.marco.MarcoElasticsearchExecutor.execute_query", "modulename": "debeir.datasets.marco", "qualname": "MarcoElasticsearchExecutor.execute_query", "kind": "function", "doc": "

    Execute a query given parameters

    \n\n
    Parameters
    \n\n
      \n
    • args:
    • \n
    • kwargs:
    • \n
    \n", "signature": "(\tself,\tquery=None,\ttopic_num=None,\tablation=False,\tquery_type='query',\t**kwargs):", "funcdef": "async def"}, {"fullname": "debeir.datasets.marco.MarcoQueryConfig", "modulename": "debeir.datasets.marco", "qualname": "MarcoQueryConfig", "kind": "class", "doc": "

    \n", "bases": "debeir.core.config.GenericConfig"}, {"fullname": "debeir.datasets.marco.MarcoQueryConfig.__init__", "modulename": "debeir.datasets.marco", "qualname": "MarcoQueryConfig.__init__", "kind": "function", "doc": "

    \n", "signature": "(\tquery_type: str,\tindex: str = None,\tencoder_normalize: bool = True,\tablations: bool = False,\tnorm_weight: float = None,\tautomatic: bool = None,\tencoder: object = None,\tencoder_fp: str = None,\tquery_weights: List[float] = None,\tcosine_weights: List[float] = None,\tevaluate: bool = False,\tqrels: str = None,\tconfig_fn: str = None,\tquery_fn: str = None,\tparser_fn: str = None,\texecutor_fn: str = None,\tcosine_ceiling: float = None,\ttopics_path: str = None,\treturn_id_only: bool = False,\toverwrite_output_if_exists: bool = False,\toutput_file: str = None,\trun_name: str = None)"}, {"fullname": "debeir.datasets.marco.MarcoQueryConfig.validate", "modulename": "debeir.datasets.marco", "qualname": "MarcoQueryConfig.validate", "kind": "function", "doc": "

    Validates whether the config is correct.\nMust be implemented by inheriting classes.

    \n", "signature": "(self):", "funcdef": "def"}, {"fullname": "debeir.datasets.marco.MarcoQueryConfig.from_toml", "modulename": "debeir.datasets.marco", "qualname": "MarcoQueryConfig.from_toml", "kind": "function", "doc": "

    Instantiates a Config object from a toml file

    \n\n
    Parameters
    \n\n
      \n
    • fp: File path of the Config TOML file
    • \n
    • field_class: Class of the Config object to be instantiated
    • \n
    • args: Arguments to be passed to Config
    • \n
    • kwargs: Keyword arguments to be passed
    • \n
    \n\n
    Returns
    \n\n
    \n
    An instantiated and validated Config object.\n
    \n
    \n", "signature": "(cls, fp: str, *args, **kwargs) -> debeir.datasets.marco.MarcoQueryConfig:", "funcdef": "def"}, {"fullname": "debeir.datasets.marco.MarcoQueryConfig.from_dict", "modulename": "debeir.datasets.marco", "qualname": "MarcoQueryConfig.from_dict", "kind": "function", "doc": "

    Instantiates a Config object from a dictionary

    \n\n
    Parameters
    \n\n
      \n
    • data_class:
    • \n
    • kwargs:
    • \n
    \n\n
    Returns
    \n", "signature": "(cls, **kwargs) -> debeir.datasets.marco.MarcoQueryConfig:", "funcdef": "def"}, {"fullname": "debeir.datasets.trec_clinical_trials", "modulename": "debeir.datasets.trec_clinical_trials", "kind": "module", "doc": "

    \n"}, {"fullname": "debeir.datasets.trec_clinical_trials.TREClinicalTrialDocumentParser", "modulename": "debeir.datasets.trec_clinical_trials", "qualname": "TREClinicalTrialDocumentParser", "kind": "class", "doc": "

    Parser for Clinical Trials topics

    \n", "bases": "debeir.core.parser.XMLParser"}, {"fullname": "debeir.datasets.trec_clinical_trials.TREClinicalTrialDocumentParser.extract", "modulename": "debeir.datasets.trec_clinical_trials", "qualname": "TREClinicalTrialDocumentParser.extract", "kind": "function", "doc": "

    \n", "signature": "(cls, path) -> Dict:", "funcdef": "def"}, {"fullname": "debeir.datasets.trec_clinical_trials.TrecClincialElasticsearchQuery", "modulename": "debeir.datasets.trec_clinical_trials", "qualname": "TrecClincialElasticsearchQuery", "kind": "class", "doc": "

    A generic elasticsearch query. Contains methods for NIR-style (embedding) queries and normal BM25 queries.\nRequires topics and configs to be included.

    \n", "bases": "debeir.core.query.GenericElasticsearchQuery"}, {"fullname": "debeir.datasets.trec_clinical_trials.TrecClincialElasticsearchQuery.__init__", "modulename": "debeir.datasets.trec_clinical_trials", "qualname": "TrecClincialElasticsearchQuery.__init__", "kind": "function", "doc": "

    \n", "signature": "(topics, config, *args, **kwargs)"}, {"fullname": "debeir.datasets.trec_covid", "modulename": "debeir.datasets.trec_covid", "kind": "module", "doc": "

    \n"}, {"fullname": "debeir.datasets.trec_covid.TrecCovidParser", "modulename": "debeir.datasets.trec_covid", "qualname": "TrecCovidParser", "kind": "class", "doc": "

    Load topics from an XML file

    \n", "bases": "debeir.core.parser.XMLParser"}, {"fullname": "debeir.datasets.trec_covid.TrecCovidParser.get_topics", "modulename": "debeir.datasets.trec_covid", "qualname": "TrecCovidParser.get_topics", "kind": "function", "doc": "

    Instance method for getting topics, forwards instance self parameters to the _get_topics class method.

    \n", "signature": "(cls, xmlfile) -> Dict[int, Dict[str, str]]:", "funcdef": "def"}, {"fullname": "debeir.datasets.trec_covid.TrecElasticsearchQuery", "modulename": "debeir.datasets.trec_covid", "qualname": "TrecElasticsearchQuery", "kind": "class", "doc": "

    A generic elasticsearch query. Contains methods for NIR-style (embedding) queries and normal BM25 queries.\nRequires topics and configs to be included.

    \n", "bases": "debeir.core.query.GenericElasticsearchQuery"}, {"fullname": "debeir.datasets.trec_covid.TrecElasticsearchQuery.__init__", "modulename": "debeir.datasets.trec_covid", "qualname": "TrecElasticsearchQuery.__init__", "kind": "function", "doc": "

    \n", "signature": "(topics, config, *args, **kwargs)"}, {"fullname": "debeir.datasets.types", "modulename": "debeir.datasets.types", "kind": "module", "doc": "

    \n"}, {"fullname": "debeir.datasets.types.InputExample", "modulename": "debeir.datasets.types", "qualname": "InputExample", "kind": "class", "doc": "

    Copied from the Sentence Transformers library.\nStructure for one input example with texts, the label and a unique id.

    \n"}, {"fullname": "debeir.datasets.types.InputExample.__init__", "modulename": "debeir.datasets.types", "qualname": "InputExample.__init__", "kind": "function", "doc": "

    Creates one InputExample with the given texts, guid and label

    \n\n

    :param guid: id for the example\n:param texts: the texts for the example. Note: str.strip() is called on the texts.\n:param label: the label for the example

    \n", "signature": "(\tguid: str = '',\ttexts: List[str] = None,\tlabel: Union[int, float] = 0)"}, {"fullname": "debeir.datasets.types.InputExample.get_label", "modulename": "debeir.datasets.types", "qualname": "InputExample.get_label", "kind": "function", "doc": "

    \n", "signature": "(self):", "funcdef": "def"}, {"fullname": "debeir.datasets.types.InputExample.to_dict", "modulename": "debeir.datasets.types", "qualname": "InputExample.to_dict", "kind": "function", "doc": "

    \n", "signature": "(cls, data: List[debeir.datasets.types.InputExample]):", "funcdef": "def"}, {"fullname": "debeir.datasets.types.InputExample.from_parser_output", "modulename": "debeir.datasets.types", "qualname": "InputExample.from_parser_output", "kind": "function", "doc": "

    \n", "signature": "(cls, data):", "funcdef": "def"}, {"fullname": "debeir.datasets.types.RelevanceExample", "modulename": "debeir.datasets.types", "qualname": "RelevanceExample", "kind": "class", "doc": "

    Converts relevance labels to the range 0 - 1

    \n", "bases": "InputExample"}, {"fullname": "debeir.datasets.types.RelevanceExample.__init__", "modulename": "debeir.datasets.types", "qualname": "RelevanceExample.__init__", "kind": "function", "doc": "

    Creates one InputExample with the given texts, guid and label

    \n\n

    :param guid: id for the example\n:param texts: the texts for the example. Note: str.strip() is called on the texts.\n:param label: the label for the example

    \n", "signature": "(max_score=2, *args, **kwargs)"}, {"fullname": "debeir.datasets.types.RelevanceExample.get_label", "modulename": "debeir.datasets.types", "qualname": "RelevanceExample.get_label", "kind": "function", "doc": "

    \n", "signature": "(self):", "funcdef": "def"}, {"fullname": "debeir.datasets.types.RelevanceExample.relevance", "modulename": "debeir.datasets.types", "qualname": "RelevanceExample.relevance", "kind": "function", "doc": "
    Returns
    \n\n
    \n
    Returns a normalised score for relevance between 0 - 1\n
    \n
    \n", "signature": "(self):", "funcdef": "def"}, {"fullname": "debeir.datasets.types.DatasetTypes", "modulename": "debeir.datasets.types", "qualname": "DatasetTypes", "kind": "class", "doc": "

    A collection of common dataset types that are usable in the library.

    \n", "bases": "enum.Enum"}, {"fullname": "debeir.datasets.utils", "modulename": "debeir.datasets.utils", "kind": "module", "doc": "

    \n"}, {"fullname": "debeir.datasets.utils.CrossValidatorDataset", "modulename": "debeir.datasets.utils", "qualname": "CrossValidatorDataset", "kind": "class", "doc": "

    Cross Validator Dataset

    \n"}, {"fullname": "debeir.datasets.utils.CrossValidatorDataset.__init__", "modulename": "debeir.datasets.utils", "qualname": "CrossValidatorDataset.__init__", "kind": "function", "doc": "

    \n", "signature": "(dataset, cross_validator, n_folds, x_attr='text', y_attr='label')"}, {"fullname": "debeir.datasets.utils.CrossValidatorDataset.prepare_cross_validator", "modulename": "debeir.datasets.utils", "qualname": "CrossValidatorDataset.prepare_cross_validator", "kind": "function", "doc": "

    Prepare the cross validator dataset object that will internally produce the folds.

    \n\n
    Parameters
    \n\n
      \n
    • data: Dataset to be used. Should be a list of dicts, a list of [x, y] pairs, or a Dataset object from the datasets library
    • \n
    • evaluator: Evaluator to use for checking results
    • \n
    • n_splits: Number of cross validation splits, k-fold (stratified)
    • \n
    • seed: Seed to use (default 42)
    • \n
    • y_attr: Label, or idx of the y label
    • \n
    • x_attr: Label or idx of the x label (not directly used)
    • \n
    \n", "signature": "(\tcls,\tdata,\tevaluator: debeir.evaluation.evaluator.Evaluator,\tn_splits: int,\tx_attr,\ty_attr,\tseed=42) -> debeir.datasets.utils.CrossValidatorDataset:", "funcdef": "def"}, {"fullname": "debeir.datasets.utils.CrossValidatorDataset.get_fold", "modulename": "debeir.datasets.utils", "qualname": "CrossValidatorDataset.get_fold", "kind": "function", "doc": "

    Gets the fold and returns a datasets.DatasetDict object with\nDatasetDict{'train': ..., 'val': ...}

    \n\n
    Parameters
    \n\n
      \n
    • idx:
    • \n
    \n", "signature": "(self, idx) -> datasets.dataset_dict.DatasetDict:", "funcdef": "def"}, {"fullname": "debeir.engines", "modulename": "debeir.engines", "kind": "module", "doc": "

    WIP

    \n\n

    Implemented Search Engines to run queries against.

    \n"}, {"fullname": "debeir.engines.client", "modulename": "debeir.engines.client", "kind": "module", "doc": "

    \n"}, {"fullname": "debeir.engines.client.Client", "modulename": "debeir.engines.client", "qualname": "Client", "kind": "class", "doc": "

    Overarching client interface object that contains references to different clients for search.\nAllows sharing between function calls.

    \n"}, {"fullname": "debeir.engines.client.Client.__init__", "modulename": "debeir.engines.client", "qualname": "Client.__init__", "kind": "function", "doc": "

    \n", "signature": "(\tes_client: elasticsearch.AsyncElasticsearch = None,\tsolr_client: object = None,\tgeneric_client: object = None)"}, {"fullname": "debeir.engines.client.Client.build_from_config", "modulename": "debeir.engines.client", "qualname": "Client.build_from_config", "kind": "function", "doc": "

    Build client from engine config

    \n\n
    Parameters
    \n\n
      \n
    • engine_type:
    • \n
    • engine_config:
    • \n
    \n\n
    Returns
    \n", "signature": "(cls, engine_type, engine_config) -> debeir.engines.client.Client:", "funcdef": "def"}, {"fullname": "debeir.engines.client.Client.get_client", "modulename": "debeir.engines.client", "qualname": "Client.get_client", "kind": "function", "doc": "

    \n", "signature": "(self, engine):", "funcdef": "def"}, {"fullname": "debeir.engines.client.Client.close", "modulename": "debeir.engines.client", "qualname": "Client.close", "kind": "function", "doc": "

    Generically close all contained client objects

    \n", "signature": "(self):", "funcdef": "async def"}, {"fullname": "debeir.engines.dummyindex", "modulename": "debeir.engines.dummyindex", "kind": "module", "doc": "

    \n"}, {"fullname": "debeir.engines.dummyindex.index", "modulename": "debeir.engines.dummyindex.index", "kind": "module", "doc": "

    \n"}, {"fullname": "debeir.engines.dummyindex.index.DummyIndex", "modulename": "debeir.engines.dummyindex.index", "qualname": "DummyIndex", "kind": "class", "doc": "

    \n"}, {"fullname": "debeir.engines.dummyindex.index.DummyIndex.__init__", "modulename": "debeir.engines.dummyindex.index", "qualname": "DummyIndex.__init__", "kind": "function", "doc": "

    \n", "signature": "()"}, {"fullname": "debeir.engines.dummyindex.index.DummyIndex.get_documents", "modulename": "debeir.engines.dummyindex.index", "qualname": "DummyIndex.get_documents", "kind": "function", "doc": "

    \n", "signature": "(self):", "funcdef": "def"}, {"fullname": "debeir.engines.dummyindex.index.DummyIndex.query", "modulename": "debeir.engines.dummyindex.index", "qualname": "DummyIndex.query", "kind": "function", "doc": "

    \n", "signature": "(self):", "funcdef": "def"}, {"fullname": "debeir.engines.dummyindex.index.DummyIndex.scorer", "modulename": "debeir.engines.dummyindex.index", "qualname": "DummyIndex.scorer", "kind": "function", "doc": "

    \n", "signature": "(self):", "funcdef": "def"}, {"fullname": "debeir.engines.dummyindex.index.es_isup", "modulename": "debeir.engines.dummyindex.index", "qualname": "es_isup", "kind": "function", "doc": "

    \n", "signature": "(es_client: elasticsearch.AsyncElasticsearch):", "funcdef": "async def"}, {"fullname": "debeir.engines.elasticsearch", "modulename": "debeir.engines.elasticsearch", "kind": "module", "doc": "

    Library code for interacting with the elasticsearch engine

    \n\n

    Contains many helper functions for asynchronous and fast querying, with optional caching available

    \n"}, {"fullname": "debeir.engines.elasticsearch.change_bm25", "modulename": "debeir.engines.elasticsearch.change_bm25", "kind": "module", "doc": "

    \n"}, {"fullname": "debeir.engines.elasticsearch.change_bm25.change_bm25_params", "modulename": "debeir.engines.elasticsearch.change_bm25", "qualname": "change_bm25_params", "kind": "function", "doc": "

    Change the BM25 parameters of the elasticsearch BM25 ranker.

    \n\n
    Parameters
    \n\n
      \n
    • index: The elasticsearch index name
    • \n
    • k1: The k1 parameter for BM25 (default 1.2) [Usually 0-3] [Term saturation constant] ->\nThe higher the k1 value, the more weight given to documents that repeat terms.
    • \n
    • b: The b parameter for BM25 (default 0.75) [Usually 0-1] [Document length constant] ->\nThe higher the b value, the more it penalises longer documents.
    • \n
    • base_url: The elasticsearch base URL for API requests (without index suffix)
    • \n
    \n", "signature": "(index, k1: float, b: float, base_url: str = 'http://localhost:9200'):", "funcdef": "def"}, {"fullname": "debeir.engines.elasticsearch.executor", "modulename": "debeir.engines.elasticsearch.executor", "kind": "module", "doc": "

    \n"}, {"fullname": "debeir.engines.elasticsearch.executor.ElasticsearchExecutor", "modulename": "debeir.engines.elasticsearch.executor", "qualname": "ElasticsearchExecutor", "kind": "class", "doc": "

    Executes an elasticsearch query given the query generated from the config, topics and query class object.

    \n\n

    Computes regular patterns of queries expected from general IR topics and indexes.\nIncludes:\n 1. Reranking\n 2. End-to-End Neural IR\n 3. Statistical keyword matching

    \n"}, {"fullname": "debeir.engines.elasticsearch.executor.ElasticsearchExecutor.__init__", "modulename": "debeir.engines.elasticsearch.executor", "qualname": "ElasticsearchExecutor.__init__", "kind": "function", "doc": "

    \n", "signature": "(\ttopics: Dict[Union[str, int], Dict[str, str]],\tclient: elasticsearch.AsyncElasticsearch,\tindex_name: str,\toutput_file: str,\tquery: debeir.core.query.GenericElasticsearchQuery,\tencoder: Optional[debeir.rankers.transformer_sent_encoder.Encoder],\treturn_size: int = 1000,\ttest=False,\treturn_id_only=True,\tconfig=None)"}, {"fullname": "debeir.engines.elasticsearch.executor.ElasticsearchExecutor.generate_query", "modulename": "debeir.engines.elasticsearch.executor", "qualname": "ElasticsearchExecutor.generate_query", "kind": "function", "doc": "

    Generates a query given a topic number from the list of topics

    \n\n
    Parameters
    \n\n
      \n
    • topic_num:
    • \n
    \n", "signature": "(self, topic_num):", "funcdef": "def"}, {"fullname": "debeir.engines.elasticsearch.executor.ElasticsearchExecutor.execute_query", "modulename": "debeir.engines.elasticsearch.executor", "qualname": "ElasticsearchExecutor.execute_query", "kind": "function", "doc": "

    Execute a query given parameters

    \n\n
    Parameters
    \n\n
      \n
    • args:
    • \n
    • kwargs:
    • \n
    \n", "signature": "(self, *args, **kwargs):", "funcdef": "def"}, {"fullname": "debeir.engines.elasticsearch.executor.ElasticsearchExecutor.run_all_queries", "modulename": "debeir.engines.elasticsearch.executor", "qualname": "ElasticsearchExecutor.run_all_queries", "kind": "function", "doc": "

    A generic function that will asynchronously run all topics using the execute_query() method

    \n\n
    Parameters
    \n\n
      \n
    • query_type: Which query to execute. Query_type determines which method is used to generate the queries\nfrom self.query.query_funcs: Dict[str, func]
    • \n
    • return_results: Whether to return raw results from the client. Useful for analysing results directly or\nfor computing the BM25 scores for log normalization in NIR-style scoring
    • \n
    • return_size: Number of documents to return. Overrides the config value if exists.
    • \n
    • return_id_only: Return the ID of the document only, rather than the full source document.
    • \n
    • args: Arguments to pass to the execute_query method
    • \n
    • kwargs: Keyword arguments to pass to the execute_query method
    • \n
    \n\n
    Returns
    \n\n
    \n
    A list of results if return_results = True else an empty list is returned.\n
    \n
    \n", "signature": "(\tself,\tquery_type=None,\treturn_results=False,\treturn_size: int = None,\treturn_id_only: bool = False,\t**kwargs) -> List:", "funcdef": "async def"}, {"fullname": "debeir.engines.elasticsearch.generate_script_score", "modulename": "debeir.engines.elasticsearch.generate_script_score", "kind": "module", "doc": "

    \n"}, {"fullname": "debeir.engines.elasticsearch.generate_script_score.SourceBuilder", "modulename": "debeir.engines.elasticsearch.generate_script_score", "qualname": "SourceBuilder", "kind": "class", "doc": "

    Builds the script score source for NIR-style queries in elasticsearch.\nUses the Painless language.

    \n\n

    This is a string builder class

    \n"}, {"fullname": "debeir.engines.elasticsearch.generate_script_score.SourceBuilder.__init__", "modulename": "debeir.engines.elasticsearch.generate_script_score", "qualname": "SourceBuilder.__init__", "kind": "function", "doc": "

    \n", "signature": "()"}, {"fullname": "debeir.engines.elasticsearch.generate_script_score.SourceBuilder.add_preamble", "modulename": "debeir.engines.elasticsearch.generate_script_score", "qualname": "SourceBuilder.add_preamble", "kind": "function", "doc": "

    Adds the preamble to the internal string.\nThe generated script will return the BM25 score if the normalization constant is below 0.

    \n", "signature": "(self):", "funcdef": "def"}, {"fullname": "debeir.engines.elasticsearch.generate_script_score.SourceBuilder.add_log_score", "modulename": "debeir.engines.elasticsearch.generate_script_score", "qualname": "SourceBuilder.add_log_score", "kind": "function", "doc": "

    Adds the BM25 log score line

    \n\n
    Parameters
    \n\n
      \n
    • ignore_below_one: Ignore all scores below 1.0, since log(1) = 0. Otherwise, only ignore scores of 0 and below.
    • \n
    \n\n
    Returns
    \n\n
    \n
    SourceBuilder\n
    \n
    \n", "signature": "(\tself,\tignore_below_one=False) -> debeir.engines.elasticsearch.generate_script_score.SourceBuilder:", "funcdef": "def"}, {"fullname": "debeir.engines.elasticsearch.generate_script_score.SourceBuilder.add_embed_field", "modulename": "debeir.engines.elasticsearch.generate_script_score", "qualname": "SourceBuilder.add_embed_field", "kind": "function", "doc": "

    Adds a cosine score line.

    \n\n
    Parameters
    \n\n
      \n
    • qfield: Query field
    • \n
    • field: Document facet field
    • \n
    \n\n
    Returns
    \n", "signature": "(\tself,\tqfield,\tfield) -> debeir.engines.elasticsearch.generate_script_score.SourceBuilder:", "funcdef": "def"}, {"fullname": "debeir.engines.elasticsearch.generate_script_score.SourceBuilder.finish", "modulename": "debeir.engines.elasticsearch.generate_script_score", "qualname": "SourceBuilder.finish", "kind": "function", "doc": "

    Finalises the script score and returns the internal string

    \n\n
    Returns
    \n\n
    \n
    A string containing the script score query\n
    \n
    \n", "signature": "(self):", "funcdef": "def"}, {"fullname": "debeir.engines.elasticsearch.generate_script_score.generate_source", "modulename": "debeir.engines.elasticsearch.generate_script_score", "qualname": "generate_source", "kind": "function", "doc": "

    Generates the script source based on a set of input fields and facets

    \n\n
    Parameters
    \n\n
      \n
    • qfields: Query fields (or topic fields)
    • \n
    • fields: Document facets to compute cosine similarity on
    • \n
    \n\n
    Returns
    \n", "signature": "(qfields: Union[list, str], fields) -> str:", "funcdef": "def"}, {"fullname": "debeir.engines.elasticsearch.generate_script_score.check_params_is_valid", "modulename": "debeir.engines.elasticsearch.generate_script_score", "qualname": "check_params_is_valid", "kind": "function", "doc": "

    Validates whether the parameters for the script score pass a simple sanity check.

    \n\n
    Parameters
    \n\n
      \n
    • params:
    • \n
    • qfields:
    • \n
    \n", "signature": "(params, qfields):", "funcdef": "def"}, {"fullname": "debeir.engines.elasticsearch.generate_script_score.generate_script", "modulename": "debeir.engines.elasticsearch.generate_script_score", "qualname": "generate_script", "kind": "function", "doc": "

    Generates the script object from the given parameters.

    \n\n
    Parameters
    \n\n
      \n
    • fields: Document fields to search
    • \n
    • params: Parameters for the script
    • \n
    • source_generator: Function that will generate the script
    • \n
    • qfields: Query fields to search from (topic facets)
    • \n
    \n\n
    Returns
    \n", "signature": "(\tfields,\tparams,\tsource_generator=<function generate_source>,\tqfields='q_eb') -> Dict:", "funcdef": "def"}, {"fullname": "debeir.engines.solr", "modulename": "debeir.engines.solr", "kind": "module", "doc": "

    \n"}, {"fullname": "debeir.evaluation", "modulename": "debeir.evaluation", "kind": "module", "doc": "

    Evaluation for retrieved results.

    \n\n

    Works for TREC-style queries or for out-of-the-box returned results from the implemented search engines.

    \n"}, {"fullname": "debeir.evaluation.cross_validation", "modulename": "debeir.evaluation.cross_validation", "kind": "module", "doc": "

    \n"}, {"fullname": "debeir.evaluation.cross_validation.split_k_fold", "modulename": "debeir.evaluation.cross_validation", "qualname": "split_k_fold", "kind": "function", "doc": "

    \n", "signature": "(n_fold, data_files):", "funcdef": "def"}, {"fullname": "debeir.evaluation.cross_validation.CrossValidatorTypes", "modulename": "debeir.evaluation.cross_validation", "qualname": "CrossValidatorTypes", "kind": "class", "doc": "

    Cross Validator Strategies for separating the dataset

    \n", "bases": "enum.Enum"}, {"fullname": "debeir.evaluation.cross_validation.CrossValidatorTypes.Stratified", "modulename": "debeir.evaluation.cross_validation", "qualname": "CrossValidatorTypes.Stratified", "kind": "variable", "doc": "

    \n", "default_value": " = <CrossValidatorTypes.Stratified: 'StratifiedKFold'>"}, {"fullname": "debeir.evaluation.cross_validation.CrossValidatorTypes.KFold", "modulename": "debeir.evaluation.cross_validation", "qualname": "CrossValidatorTypes.KFold", "kind": "variable", "doc": "

    \n", "default_value": " = <CrossValidatorTypes.KFold: 'KFold'>"}, {"fullname": "debeir.evaluation.cross_validation.CrossValidator", "modulename": "debeir.evaluation.cross_validation", "qualname": "CrossValidator", "kind": "class", "doc": "

    Cross Validator class for different types of datasets

    \n\n

    E.g. List -> [[Data], label]\n List[Dict] -> {\"data\": Data, \"label\": label}\n Huggingface Dataset Object -> Data(set=\"train\", label = \"label\").select(idx)

    \n"}, {"fullname": "debeir.evaluation.cross_validation.CrossValidator.__init__", "modulename": "debeir.evaluation.cross_validation", "qualname": "CrossValidator.__init__", "kind": "function", "doc": "

    \n", "signature": "(\tdataset: Union[List, List[Dict], datasets.arrow_dataset.Dataset],\tx_idx_label_or_attr: Union[str, int],\ty_idx_label_or_attr: Union[str, int],\tcross_validator_type: [<class 'str'>, <enum 'CrossValidatorTypes'>] = <CrossValidatorTypes.Stratified: 'StratifiedKFold'>,\tseed=42,\tn_splits=5)"}, {"fullname": "debeir.evaluation.cross_validation.CrossValidator.get_fold", "modulename": "debeir.evaluation.cross_validation", "qualname": "CrossValidator.get_fold", "kind": "function", "doc": "
    Parameters
    \n\n
      \n
    • fold_num: Which fold to pick
    • \n
    \n\n
    Returns
    \n", "signature": "(self, fold_num: int):", "funcdef": "def"}, {"fullname": "debeir.evaluation.evaluator", "modulename": "debeir.evaluation.evaluator", "kind": "module", "doc": "

    \n"}, {"fullname": "debeir.evaluation.evaluator.Evaluator", "modulename": "debeir.evaluation.evaluator", "qualname": "Evaluator", "kind": "class", "doc": "

    Evaluation class for computing metrics from TREC-style files

    \n"}, {"fullname": "debeir.evaluation.evaluator.Evaluator.__init__", "modulename": "debeir.evaluation.evaluator", "qualname": "Evaluator.__init__", "kind": "function", "doc": "

    \n", "signature": "(qrels: str, metrics: List[str])"}, {"fullname": "debeir.evaluation.evaluator.Evaluator.evaluate_runs", "modulename": "debeir.evaluation.evaluator", "qualname": "Evaluator.evaluate_runs", "kind": "function", "doc": "

    Evaluates the TREC-style results from an input result list or file

    \n\n
    Parameters
    \n\n
      \n
    • res: Results file path or raw results list
    • \n
    • kwargs: Keyword arguments to pass to the underlying analysis_tools_ir.parse_run library
    • \n
    \n\n
    Returns
    \n", "signature": "(self, res: Union[str, List[str]], **kwargs):", "funcdef": "def"}, {"fullname": "debeir.evaluation.evaluator.Evaluator.average_all_metrics", "modulename": "debeir.evaluation.evaluator", "qualname": "Evaluator.average_all_metrics", "kind": "function", "doc": "

    Averages the per-topic metric scores into a single averaged score.

    \n\n
    Parameters
    \n\n
      \n
    • runs: Parsed run dictionary: {metric_name@depth: Run object}
    • \n
    • logger: Logger to print metrics
    • \n
    \n", "signature": "(\tself,\truns: Dict,\tlogger: <loguru.logger handlers=[(id=0, level=10, sink=<_io.StringIO object at 0x103af2710>)]>):", "funcdef": "def"}, {"fullname": "debeir.evaluation.evaluator.Evaluator.sigtests", "modulename": "debeir.evaluation.evaluator", "qualname": "Evaluator.sigtests", "kind": "function", "doc": "

    Run a paired significance test on two result files

    \n\n
    Parameters
    \n\n
      \n
    • results_a:
    • \n
    • results_b:
    • \n
    \n\n
    Returns
    \n", "signature": "(self, results_a, results_b):", "funcdef": "def"}, {"fullname": "debeir.evaluation.evaluator.Evaluator.build_from_config", "modulename": "debeir.evaluation.evaluator", "qualname": "Evaluator.build_from_config", "kind": "function", "doc": "

    \n", "signature": "(\tcls,\tconfig: debeir.core.config.GenericConfig,\tmetrics_config: debeir.core.config.MetricsConfig):", "funcdef": "def"}, {"fullname": "debeir.evaluation.residual_scoring", "modulename": "debeir.evaluation.residual_scoring", "kind": "module", "doc": "

    \n"}, {"fullname": "debeir.evaluation.residual_scoring.ResidualEvaluator", "modulename": "debeir.evaluation.residual_scoring", "qualname": "ResidualEvaluator", "kind": "class", "doc": "

    Residual scoring is the scoring of a subset of documents, i.e. the residual. The residual is created by removing documents from the collection and qrels.

    \n", "bases": "debeir.evaluation.evaluator.Evaluator"}, {"fullname": "debeir.evaluation.residual_scoring.ResidualEvaluator.__init__", "modulename": "debeir.evaluation.residual_scoring", "qualname": "ResidualEvaluator.__init__", "kind": "function", "doc": "

    Args:\n qrels (str): Path to qrels.\n metrics (List[str]): A list of metrics with depth, e.g. NDCG@1000.\n filter_ids (Dict[str, List[str]]): IDs to remove from the collection, given as Dict[topic_num, [doc_ids]].

    \n", "signature": "(qrels: str, metrics: List[str], filter_ids: Dict[str, List[str]])"}, {"fullname": "debeir.evaluation.residual_scoring.ResidualEvaluator.evaluate_runs", "modulename": "debeir.evaluation.residual_scoring", "qualname": "ResidualEvaluator.evaluate_runs", "kind": "function", "doc": "

    Run the residual evaluation for the runs

    \n\n
    Parameters
    \n\n
      \n
    • res: The results to run the evaluator against
    • \n
    • with_trec_binary: Use the TREC C binary instead of the default Python library, defaults to False
    • \n
    \n\n
    Returns
    \n\n
    \n

    A dictionary of supplied metrics of the results against the qrels

    \n
    \n", "signature": "(self, res: Union[str, List[str]], with_trec_binary=False, **kwargs):", "funcdef": "def"}, {"fullname": "debeir.models", "modulename": "debeir.models", "kind": "module", "doc": "

    \n"}, {"fullname": "debeir.models.colbert", "modulename": "debeir.models.colbert", "kind": "module", "doc": "

    \n"}, {"fullname": "debeir.models.colbert.CoLBERTConfig", "modulename": "debeir.models.colbert", "qualname": "CoLBERTConfig", "kind": "class", "doc": "

    \n"}, {"fullname": "debeir.models.colbert.CoLBERTConfig.__init__", "modulename": "debeir.models.colbert", "qualname": "CoLBERTConfig.__init__", "kind": "function", "doc": "

    \n", "signature": "(**kwargs)"}, {"fullname": "debeir.models.colbert.CoLBERTConfig.save", "modulename": "debeir.models.colbert", "qualname": "CoLBERTConfig.save", "kind": "function", "doc": "
    Parameters
    \n\n
      \n
    • fname: file name
    • \n
    • path: Path to save
    • \n
    \n", "signature": "(self, path, fname='colbert_config.json'):", "funcdef": "def"}, {"fullname": "debeir.models.colbert.CoLBERTConfig.load", "modulename": "debeir.models.colbert", "qualname": "CoLBERTConfig.load", "kind": "function", "doc": "

    Loads the ColBERT config from path (point to the directory, not the file name).

    \n\n
    Returns
    \n", "signature": "(cls, path, fname='colbert_config.json'):", "funcdef": "def"}, {"fullname": "debeir.models.colbert.ConvolutionalBlock", "modulename": "debeir.models.colbert", "qualname": "ConvolutionalBlock", "kind": "class", "doc": "

    Base class for all neural network modules.

    \n\n

    Your models should also subclass this class.

    \n\n

    Modules can also contain other Modules, allowing to nest them in\na tree structure. You can assign the submodules as regular attributes::

    \n\n
    import torch.nn as nn\nimport torch.nn.functional as F\n\nclass Model(nn.Module):\n    def __init__(self):\n        super().__init__()\n        self.conv1 = nn.Conv2d(1, 20, 5)\n        self.conv2 = nn.Conv2d(20, 20, 5)\n\n    def forward(self, x):\n        x = F.relu(self.conv1(x))\n        return F.relu(self.conv2(x))\n
    \n\n

    Submodules assigned in this way will be registered, and will have their\nparameters converted too when you call to(), etc.

    \n\n
    \n\n

    As per the example above, an __init__() call to the parent class\nmust be made before assignment on the child.

    \n\n
    \n\n

    :ivar training: Boolean represents whether this module is in training or\n evaluation mode.\n:vartype training: bool

    \n", "bases": "torch.nn.modules.module.Module"}, {"fullname": "debeir.models.colbert.ConvolutionalBlock.__init__", "modulename": "debeir.models.colbert", "qualname": "ConvolutionalBlock.__init__", "kind": "function", "doc": "

    Initializes internal Module state, shared by both nn.Module and ScriptModule.

    \n", "signature": "(\tin_channels,\tout_channels,\tkernel_size=1,\tfirst_stride=1,\tact_func=<class 'torch.nn.modules.activation.ReLU'>)"}, {"fullname": "debeir.models.colbert.ConvolutionalBlock.forward", "modulename": "debeir.models.colbert", "qualname": "ConvolutionalBlock.forward", "kind": "function", "doc": "

    Defines the computation performed at every call.

    \n\n

    Should be overridden by all subclasses.

    \n\n
    \n\n

    Although the recipe for the forward pass needs to be defined within this function, one should call the Module instance afterwards instead of this since the former takes care of running the registered hooks while the latter silently ignores them.

    \n\n
    \n", "signature": "(self, x):", "funcdef": "def"}, {"fullname": "debeir.models.colbert.KMaxPool", "modulename": "debeir.models.colbert", "qualname": "KMaxPool", "kind": "class", "doc": "

    Base class for all neural network modules.


    \n", "bases": "torch.nn.modules.module.Module"}, {"fullname": "debeir.models.colbert.KMaxPool.__init__", "modulename": "debeir.models.colbert", "qualname": "KMaxPool.__init__", "kind": "function", "doc": "

    Initializes internal Module state, shared by both nn.Module and ScriptModule.

    \n", "signature": "(k=1)"}, {"fullname": "debeir.models.colbert.KMaxPool.forward", "modulename": "debeir.models.colbert", "qualname": "KMaxPool.forward", "kind": "function", "doc": "

    Defines the computation performed at every call.

    \n", "signature": "(self, x):", "funcdef": "def"}, {"fullname": "debeir.models.colbert.visualisation_dump", "modulename": "debeir.models.colbert", "qualname": "visualisation_dump", "kind": "function", "doc": "

    \n", "signature": "(argmax, input_tensors):", "funcdef": "def"}, {"fullname": "debeir.models.colbert.ResidualBlock", "modulename": "debeir.models.colbert", "qualname": "ResidualBlock", "kind": "class", "doc": "

    Base class for all neural network modules.


    \n", "bases": "torch.nn.modules.module.Module"}, {"fullname": "debeir.models.colbert.ResidualBlock.__init__", "modulename": "debeir.models.colbert", "qualname": "ResidualBlock.__init__", "kind": "function", "doc": "

    Initializes internal Module state, shared by both nn.Module and ScriptModule.

    \n", "signature": "(\tin_channels,\tout_channels,\toptional_shortcut=True,\tkernel_size=1,\tact_func=<class 'torch.nn.modules.activation.ReLU'>)"}, {"fullname": "debeir.models.colbert.ResidualBlock.forward", "modulename": "debeir.models.colbert", "qualname": "ResidualBlock.forward", "kind": "function", "doc": "

    Defines the computation performed at every call.

    \n", "signature": "(self, x):", "funcdef": "def"}, {"fullname": "debeir.models.colbert.ColBERT", "modulename": "debeir.models.colbert", "qualname": "ColBERT", "kind": "class", "doc": "

    Base class for all neural network modules.


    \n", "bases": "torch.nn.modules.module.Module"}, {"fullname": "debeir.models.colbert.ColBERT.__init__", "modulename": "debeir.models.colbert", "qualname": "ColBERT.__init__", "kind": "function", "doc": "

    Initializes internal Module state, shared by both nn.Module and ScriptModule.

    \n", "signature": "(\tbert_model_args,\tbert_model_kwargs,\tconfig: transformers.models.bert.configuration_bert.BertConfig,\tdevice: str,\tmax_seq_len: int = 128,\tk: int = 8,\toptional_shortcut: bool = True,\thidden_neurons: int = 2048,\tuse_batch_norms: bool = True,\tuse_trans_blocks: bool = False,\tresidual_kernel_size: int = 1,\tdropout_perc: float = 0.5,\tact_func='mish',\tloss_func='cross_entropy_loss',\t**kwargs)"}, {"fullname": "debeir.models.colbert.ColBERT.forward", "modulename": "debeir.models.colbert", "qualname": "ColBERT.forward", "kind": "function", "doc": "

    Defines the computation performed at every call.

    \n", "signature": "(self, *args, **kwargs):", "funcdef": "def"}, {"fullname": "debeir.models.colbert.ColBERT.from_config", "modulename": "debeir.models.colbert", "qualname": "ColBERT.from_config", "kind": "function", "doc": "

    \n", "signature": "(cls, *args, config_path):", "funcdef": "def"}, {"fullname": "debeir.models.colbert.ColBERT.from_pretrained", "modulename": "debeir.models.colbert", "qualname": "ColBERT.from_pretrained", "kind": "function", "doc": "

    \n", "signature": "(cls, output_dir, **kwargs):", "funcdef": "def"}, {"fullname": "debeir.models.colbert.ColBERT.save_pretrained", "modulename": "debeir.models.colbert", "qualname": "ColBERT.save_pretrained", "kind": "function", "doc": "

    \n", "signature": "(self, output_dir):", "funcdef": "def"}, {"fullname": "debeir.models.colbert.ComBERT", "modulename": "debeir.models.colbert", "qualname": "ComBERT", "kind": "class", "doc": "

    Base class for all neural network modules.


    \n", "bases": "torch.nn.modules.module.Module"}, {"fullname": "debeir.models.colbert.ComBERT.__init__", "modulename": "debeir.models.colbert", "qualname": "ComBERT.__init__", "kind": "function", "doc": "

    Initializes internal Module state, shared by both nn.Module and ScriptModule.

    \n", "signature": "(\tbert_model_args,\tbert_model_kwargs,\tconfig: transformers.models.bert.configuration_bert.BertConfig,\tdevice: str,\tmax_seq_len: int = 128,\tk: int = 8,\toptional_shortcut: bool = True,\thidden_neurons: int = 2048,\tuse_batch_norms: bool = True,\tuse_trans_blocks: bool = False,\tresidual_kernel_size: int = 1,\tdropout_perc: float = 0.5,\tact_func='mish',\tloss_func='cross_entropy_loss',\tnum_blocks=2,\t**kwargs)"}, {"fullname": "debeir.models.colbert.ComBERT.forward", "modulename": "debeir.models.colbert", "qualname": "ComBERT.forward", "kind": "function", "doc": "

    Defines the computation performed at every call.

    \n", "signature": "(self, *args, **kwargs):", "funcdef": "def"}, {"fullname": "debeir.models.colbert.ComBERT.from_config", "modulename": "debeir.models.colbert", "qualname": "ComBERT.from_config", "kind": "function", "doc": "

    \n", "signature": "(cls, *args, config_path):", "funcdef": "def"}, {"fullname": "debeir.models.colbert.ComBERT.from_pretrained", "modulename": "debeir.models.colbert", "qualname": "ComBERT.from_pretrained", "kind": "function", "doc": "

    \n", "signature": "(cls, output_dir, **kwargs):", "funcdef": "def"}, {"fullname": "debeir.models.colbert.ComBERT.save_pretrained", "modulename": "debeir.models.colbert", "qualname": "ComBERT.save_pretrained", "kind": "function", "doc": "

    \n", "signature": "(self, output_dir):", "funcdef": "def"}, {"fullname": "debeir.rankers", "modulename": "debeir.rankers", "kind": "module", "doc": "

    Rankers module.

    \n\n
    \n

    • Runnable out-of-the-box training code
    • Custom ranking loss functions (e.g. LambdaLoss, NDCGLoss)
    • Custom rankers for reranking or NIR-style queries

    \n
    \n"}, {"fullname": "debeir.rankers.reranking", "modulename": "debeir.rankers.reranking", "kind": "module", "doc": "

    \n"}, {"fullname": "debeir.rankers.reranking.nir", "modulename": "debeir.rankers.reranking.nir", "kind": "module", "doc": "

    NIR Reranker

    \n\n

    [Insert paper link here]

    \n"}, {"fullname": "debeir.rankers.reranking.nir.NIReRanker", "modulename": "debeir.rankers.reranking.nir", "qualname": "NIReRanker", "kind": "class", "doc": "

    Re-ranker which uses the NIR scoring method:

        score = log(bm25) / log(z) + cosine_sum

    \n", "bases": "debeir.rankers.reranking.reranker.DocumentReRanker"}, {"fullname": "debeir.rankers.reranking.nir.NIReRanker.__init__", "modulename": "debeir.rankers.reranking.nir", "qualname": "NIReRanker.__init__", "kind": "function", "doc": "

    \n", "signature": "(\tquery,\tranked_list: List[debeir.core.document.Document],\tencoder: debeir.rankers.transformer_sent_encoder.Encoder,\tdistance_fn=<function cosine>,\tfacets_weights: Dict = None,\tpresort=False,\tfields_to_encode=None,\t*args,\t**kwargs)"}, {"fullname": "debeir.rankers.reranking.reranker", "modulename": "debeir.rankers.reranking.reranker", "kind": "module", "doc": "

    General re-ranking interfaces to be implemented by child classes.

    \n"}, {"fullname": "debeir.rankers.reranking.reranker.ReRanker", "modulename": "debeir.rankers.reranking.reranker", "qualname": "ReRanker", "kind": "class", "doc": "

    General interface for a re-ranker.

    \n\n

    Child classes should implement the abstract methods.

    \n"}, {"fullname": "debeir.rankers.reranking.reranker.ReRanker.__init__", "modulename": "debeir.rankers.reranking.reranker", "qualname": "ReRanker.__init__", "kind": "function", "doc": "

    \n", "signature": "(query, ranked_list: List, *args, **kwargs)"}, {"fullname": "debeir.rankers.reranking.reranker.ReRanker.rerank", "modulename": "debeir.rankers.reranking.reranker", "qualname": "ReRanker.rerank", "kind": "function", "doc": "

    Re-rank the passed ranked list using the implemented private _compute_scores method.

    \n\n
    Parameters
    \n\n
      \n
    • ranked_list:
    • \n
    \n\n
    Returns
    \n\n
    \n
    A ranked list in descending order of the score field (which will be the last item in the list)\n
    \n
    \n", "signature": "(self) -> List:", "funcdef": "def"}, {"fullname": "debeir.rankers.reranking.reranker.DocumentReRanker", "modulename": "debeir.rankers.reranking.reranker", "qualname": "DocumentReRanker", "kind": "class", "doc": "

    Reranking interface for a ranked list of Document objects.

    \n", "bases": "ReRanker"}, {"fullname": "debeir.rankers.reranking.reranker.DocumentReRanker.__init__", "modulename": "debeir.rankers.reranking.reranker", "qualname": "DocumentReRanker.__init__", "kind": "function", "doc": "

    \n", "signature": "(\tquery,\tranked_list: List[debeir.core.document.Document],\t*args,\t**kwargs)"}, {"fullname": "debeir.rankers.reranking.reranker.ReRankerPool", "modulename": "debeir.rankers.reranking.reranker", "qualname": "ReRankerPool", "kind": "class", "doc": "

    \n"}, {"fullname": "debeir.rankers.reranking.reranker.ReRankerPool.__init__", "modulename": "debeir.rankers.reranking.reranker", "qualname": "ReRankerPool.__init__", "kind": "function", "doc": "

    \n", "signature": "()"}, {"fullname": "debeir.rankers.reranking.use", "modulename": "debeir.rankers.reranking.use", "kind": "module", "doc": "

    \n"}, {"fullname": "debeir.rankers.reranking.use.USEReRanker", "modulename": "debeir.rankers.reranking.use", "qualname": "USEReRanker", "kind": "class", "doc": "

    Re-ranks based on the cosine_sum rather than the complete NIR scoring

    \n", "bases": "debeir.rankers.reranking.nir.NIReRanker"}, {"fullname": "debeir.rankers.reranking.use.USEReRanker.__init__", "modulename": "debeir.rankers.reranking.use", "qualname": "USEReRanker.__init__", "kind": "function", "doc": "

    \n", "signature": "(*args, **kwargs)"}, {"fullname": "debeir.rankers.transformer_sent_encoder", "modulename": "debeir.rankers.transformer_sent_encoder", "kind": "module", "doc": "

    \n"}, {"fullname": "debeir.rankers.transformer_sent_encoder.Encoder", "modulename": "debeir.rankers.transformer_sent_encoder", "qualname": "Encoder", "kind": "class", "doc": "

    A wrapper for the Sentence Transformer Encoder used in Universal Sentence Embeddings (USE) for ranking or reranking.

    \n\n
    Parameters
    \n\n
      \n
    • model_path: The path to a sentence transformer or transformer model.
    • \n
    • normalize: Normalize the output vectors to unit length for dot product retrieval rather than cosine.
    • \n
    • spacy_model: The spaCy or scispaCy model to use for sentence boundary detection.
    • \n
    • max_length: Maximum input length for the spaCy NLP model.
    • \n
    \n"}, {"fullname": "debeir.rankers.transformer_sent_encoder.Encoder.__init__", "modulename": "debeir.rankers.transformer_sent_encoder", "qualname": "Encoder.__init__", "kind": "function", "doc": "

    \n", "signature": "(\tmodel_path,\tnormalize=False,\tspacy_model='en_core_sci_md',\tmax_length=2000000)"}, {"fullname": "debeir.rankers.transformer_sent_encoder.Encoder.encode", "modulename": "debeir.rankers.transformer_sent_encoder", "qualname": "Encoder.encode", "kind": "function", "doc": "

    Computes sentence embeddings for a given topic, using spaCy for sentence segmentation. By default, previously computed vectors are cached; pass "disable_cache" as a kwarg to disable this.

    \n\n
    Parameters
    \n\n
      \n
    • topic: The topic (a list of sentences) to encode. Should be a raw string.
    • \n
    • disable_cache: keyword argument, pass as True to disable encoding caching.
    • \n
    \n\n
    Returns
    \n\n
    \n
    A list of encoded tensors.\n
    \n
    \n", "signature": "(self, topic: str) -> List:", "funcdef": "def"}, {"fullname": "debeir.training", "modulename": "debeir.training", "kind": "module", "doc": "

    Runnable out-of-the-box code for training re-rankers.

    \n"}, {"fullname": "debeir.training.evaluate_reranker", "modulename": "debeir.training.evaluate_reranker", "kind": "module", "doc": "

    \n"}, {"fullname": "debeir.training.evaluate_reranker.SentenceEvaluator", "modulename": "debeir.training.evaluate_reranker", "qualname": "SentenceEvaluator", "kind": "class", "doc": "

    Evaluation class for computing metrics from TREC-style files

    \n", "bases": "debeir.evaluation.evaluator.Evaluator"}, {"fullname": "debeir.training.evaluate_reranker.SentenceEvaluator.__init__", "modulename": "debeir.training.evaluate_reranker", "qualname": "SentenceEvaluator.__init__", "kind": "function", "doc": "

    \n", "signature": "(\tmodel: debeir.rankers.transformer_sent_encoder.Encoder,\tdataset: datasets.arrow_dataset.Dataset,\tparsed_topics: Dict[Union[str, int], Dict],\ttext_cols: List[str],\tquery_cols: List[str],\tid_col: str,\tdistance_fn: str,\tqrels: str,\tmetrics: List[str])"}, {"fullname": "debeir.training.evaluate_reranker.SentenceEvaluator.produce_ranked_lists", "modulename": "debeir.training.evaluate_reranker", "qualname": "SentenceEvaluator.produce_ranked_lists", "kind": "function", "doc": "

    \n", "signature": "(self):", "funcdef": "def"}, {"fullname": "debeir.training.hparm_tuning", "modulename": "debeir.training.hparm_tuning", "kind": "module", "doc": "

    Hyperparameter tuning library using Optuna and Wandb

    \n"}, {"fullname": "debeir.training.hparm_tuning.config", "modulename": "debeir.training.hparm_tuning.config", "kind": "module", "doc": "

    \n"}, {"fullname": "debeir.training.hparm_tuning.config.HparamConfig", "modulename": "debeir.training.hparm_tuning.config", "qualname": "HparamConfig", "kind": "class", "doc": "

    Hyperparameter configuration file

    \n\n

    Expects a dictionary of hyperparameters

    \n\n

    hparams: Dict
    {
        "learning_rate": {
            "type": float
            "low": 0.1
            "high": 1.0
            "step": 0.1
            # OR
            args: [0.1, 1.0, 0.1]
        },
    }

    \n", "bases": "debeir.core.config.Config"}, {"fullname": "debeir.training.hparm_tuning.config.HparamConfig.__init__", "modulename": "debeir.training.hparm_tuning.config", "qualname": "HparamConfig.__init__", "kind": "function", "doc": "

    \n", "signature": "(hparams: Dict[str, Dict])"}, {"fullname": "debeir.training.hparm_tuning.config.HparamConfig.from_json", "modulename": "debeir.training.hparm_tuning.config", "qualname": "HparamConfig.from_json", "kind": "function", "doc": "

    \n", "signature": "(cls, fp) -> debeir.training.hparm_tuning.config.HparamConfig:", "funcdef": "def"}, {"fullname": "debeir.training.hparm_tuning.config.HparamConfig.validate", "modulename": "debeir.training.hparm_tuning.config", "qualname": "HparamConfig.validate", "kind": "function", "doc": "

    Validates that the config is correct. Must be implemented by inheriting classes.

    \n", "signature": "(self):", "funcdef": "def"}, {"fullname": "debeir.training.hparm_tuning.config.HparamConfig.parse_config_to_py", "modulename": "debeir.training.hparm_tuning.config", "qualname": "HparamConfig.parse_config_to_py", "kind": "function", "doc": "

    Parses the configuration file into usable Python objects

    \n", "signature": "(self):", "funcdef": "def"}, {"fullname": "debeir.training.hparm_tuning.optuna_rank", "modulename": "debeir.training.hparm_tuning.optuna_rank", "kind": "module", "doc": "

    \n"}, {"fullname": "debeir.training.hparm_tuning.optuna_rank.objective", "modulename": "debeir.training.hparm_tuning.optuna_rank", "qualname": "objective", "kind": "function", "doc": "

    \n", "signature": "(\ttrainer: debeir.training.hparm_tuning.trainer.Trainer,\ttrial: optuna.trial._trial.Trial):", "funcdef": "def"}, {"fullname": "debeir.training.hparm_tuning.optuna_rank.run_optuna_with_wandb", "modulename": "debeir.training.hparm_tuning.optuna_rank", "qualname": "run_optuna_with_wandb", "kind": "function", "doc": "

    Partially initialize the objective function with a trainer and hparams to optimize.

    \n\n

    Optimize using the optuna library.

    \n\n
    Parameters
    \n\n
      \n
    • trainer:
    • \n
    • n_trials:
    • \n
    • maximize_objective:
    • \n
    • wandb_kwargs:
    • \n
    \n\n
    Returns
    \n", "signature": "(\ttrainer,\tn_trials=100,\tn_jobs=1,\tmaximize_objective=True,\tsave_study_path='.',\twandb_kwargs=None):", "funcdef": "def"}, {"fullname": "debeir.training.hparm_tuning.optuna_rank.print_optuna_stats", "modulename": "debeir.training.hparm_tuning.optuna_rank", "qualname": "print_optuna_stats", "kind": "function", "doc": "

    \n", "signature": "(study: optuna.study.study.Study):", "funcdef": "def"}, {"fullname": "debeir.training.hparm_tuning.trainer", "modulename": "debeir.training.hparm_tuning.trainer", "kind": "module", "doc": "

    \n"}, {"fullname": "debeir.training.hparm_tuning.trainer.OptimizersWrapper", "modulename": "debeir.training.hparm_tuning.trainer", "qualname": "OptimizersWrapper", "kind": "class", "doc": "

    \n"}, {"fullname": "debeir.training.hparm_tuning.trainer.OptimizersWrapper.__init__", "modulename": "debeir.training.hparm_tuning.trainer", "qualname": "OptimizersWrapper.__init__", "kind": "function", "doc": "

    \n", "signature": "()"}, {"fullname": "debeir.training.hparm_tuning.trainer.Trainer", "modulename": "debeir.training.hparm_tuning.trainer", "qualname": "Trainer", "kind": "class", "doc": "

    Wrapper class for a trainer.

    \n"}, {"fullname": "debeir.training.hparm_tuning.trainer.Trainer.__init__", "modulename": "debeir.training.hparm_tuning.trainer", "qualname": "Trainer.__init__", "kind": "function", "doc": "

    \n", "signature": "(model, evaluator_fn, dataset_loading_fn)"}, {"fullname": "debeir.training.hparm_tuning.trainer.Trainer.fit", "modulename": "debeir.training.hparm_tuning.trainer", "qualname": "Trainer.fit", "kind": "function", "doc": "

    \n", "signature": "(\tself,\tin_trial: optuna.trial._trial.Trial,\ttrain_dataset,\tval_dataset):", "funcdef": "def"}, {"fullname": "debeir.training.hparm_tuning.trainer.SentenceTransformerHparamTrainer", "modulename": "debeir.training.hparm_tuning.trainer", "qualname": "SentenceTransformerHparamTrainer", "kind": "class", "doc": "

    See Optuna documentation for types!

    \n", "bases": "Trainer"}, {"fullname": "debeir.training.hparm_tuning.trainer.SentenceTransformerHparamTrainer.__init__", "modulename": "debeir.training.hparm_tuning.trainer", "qualname": "SentenceTransformerHparamTrainer.__init__", "kind": "function", "doc": "

    \n", "signature": "(\tdataset_loading_fn,\tevaluator_fn,\thparams_config: debeir.training.hparm_tuning.config.HparamConfig)"}, {"fullname": "debeir.training.hparm_tuning.trainer.SentenceTransformerHparamTrainer.get_optuna_hparams", "modulename": "debeir.training.hparm_tuning.trainer", "qualname": "SentenceTransformerHparamTrainer.get_optuna_hparams", "kind": "function", "doc": "

    Get hyperparameters suggested by the Optuna library

    \n\n
    Parameters
    \n\n
      \n
    • trial: The optuna trial object
    • \n
    • hparams: Optional, pass a dictionary of HparamType[Enum] objects
    • \n
    \n\n
    Returns
    \n", "signature": "(\tself,\ttrial: optuna.trial._trial.Trial,\thparams: Sequence[debeir.training.hparm_tuning.types.Hparam] = None):", "funcdef": "def"}, {"fullname": "debeir.training.hparm_tuning.trainer.SentenceTransformerHparamTrainer.build_kwargs_and_model", "modulename": "debeir.training.hparm_tuning.trainer", "qualname": "SentenceTransformerHparamTrainer.build_kwargs_and_model", "kind": "function", "doc": "

    \n", "signature": "(self, hparams: Dict):", "funcdef": "def"}, {"fullname": "debeir.training.hparm_tuning.trainer.SentenceTransformerHparamTrainer.fit", "modulename": "debeir.training.hparm_tuning.trainer", "qualname": "SentenceTransformerHparamTrainer.fit", "kind": "function", "doc": "

    \n", "signature": "(\tself,\tin_trial: optuna.trial._trial.Trial,\ttrain_dataset,\tval_dataset):", "funcdef": "def"}, {"fullname": "debeir.training.hparm_tuning.trainer.trial_callback", "modulename": "debeir.training.hparm_tuning.trainer", "qualname": "trial_callback", "kind": "function", "doc": "

    \n", "signature": "(trial, score, epoch, *args, **kwargs):", "funcdef": "def"}, {"fullname": "debeir.training.hparm_tuning.trainer.SentenceTransformerTrainer", "modulename": "debeir.training.hparm_tuning.trainer", "qualname": "SentenceTransformerTrainer", "kind": "class", "doc": "

    See Optuna documentation for types!

    \n", "bases": "SentenceTransformerHparamTrainer"}, {"fullname": "debeir.training.hparm_tuning.trainer.SentenceTransformerTrainer.__init__", "modulename": "debeir.training.hparm_tuning.trainer", "qualname": "SentenceTransformerTrainer.__init__", "kind": "function", "doc": "

    \n", "signature": "(\tdataset: Union[datasets.dataset_dict.DatasetDict, Dict[str, datasets.arrow_dataset.Dataset]],\thparams_config: debeir.training.hparm_tuning.config.HparamConfig,\tevaluator_fn=None,\tevaluator=None,\tuse_wandb=False)"}, {"fullname": "debeir.training.hparm_tuning.trainer.SentenceTransformerTrainer.fit", "modulename": "debeir.training.hparm_tuning.trainer", "qualname": "SentenceTransformerTrainer.fit", "kind": "function", "doc": "

    \n", "signature": "(self, **extra_kwargs):", "funcdef": "def"}, {"fullname": "debeir.training.hparm_tuning.types", "modulename": "debeir.training.hparm_tuning.types", "kind": "module", "doc": "

    \n"}, {"fullname": "debeir.training.hparm_tuning.types.Hparam", "modulename": "debeir.training.hparm_tuning.types", "qualname": "Hparam", "kind": "class", "doc": "

    \n"}, {"fullname": "debeir.training.hparm_tuning.types.Hparam.__init__", "modulename": "debeir.training.hparm_tuning.types", "qualname": "Hparam.__init__", "kind": "function", "doc": "

    \n", "signature": "()"}, {"fullname": "debeir.training.hparm_tuning.types.Hparam.suggest", "modulename": "debeir.training.hparm_tuning.types", "qualname": "Hparam.suggest", "kind": "function", "doc": "

    \n", "signature": "(self, *args, **kwargs):", "funcdef": "def"}, {"fullname": "debeir.training.hparm_tuning.types.HparamFloat", "modulename": "debeir.training.hparm_tuning.types", "qualname": "HparamFloat", "kind": "class", "doc": "

    \n", "bases": "Hparam"}, {"fullname": "debeir.training.hparm_tuning.types.HparamFloat.__init__", "modulename": "debeir.training.hparm_tuning.types", "qualname": "HparamFloat.__init__", "kind": "function", "doc": "

    \n", "signature": "(\tname: str,\tlow: float,\thigh: float,\tlog: bool = False,\tstep: float = None)"}, {"fullname": "debeir.training.hparm_tuning.types.HparamFloat.suggest", "modulename": "debeir.training.hparm_tuning.types", "qualname": "HparamFloat.suggest", "kind": "function", "doc": "

    \n", "signature": "(self, trial: optuna.trial._trial.Trial):", "funcdef": "def"}, {"fullname": "debeir.training.hparm_tuning.types.HparamInt", "modulename": "debeir.training.hparm_tuning.types", "qualname": "HparamInt", "kind": "class", "doc": "

    \n", "bases": "Hparam"}, {"fullname": "debeir.training.hparm_tuning.types.HparamInt.__init__", "modulename": "debeir.training.hparm_tuning.types", "qualname": "HparamInt.__init__", "kind": "function", "doc": "

    \n", "signature": "(name: str, low: int, high: int, log: bool = False, step: int = 1)"}, {"fullname": "debeir.training.hparm_tuning.types.HparamInt.suggest", "modulename": "debeir.training.hparm_tuning.types", "qualname": "HparamInt.suggest", "kind": "function", "doc": "

    \n", "signature": "(self, trial: optuna.trial._trial.Trial):", "funcdef": "def"}, {"fullname": "debeir.training.hparm_tuning.types.HparamCategorical", "modulename": "debeir.training.hparm_tuning.types", "qualname": "HparamCategorical", "kind": "class", "doc": "

    \n", "bases": "Hparam"}, {"fullname": "debeir.training.hparm_tuning.types.HparamCategorical.__init__", "modulename": "debeir.training.hparm_tuning.types", "qualname": "HparamCategorical.__init__", "kind": "function", "doc": "

    \n", "signature": "(name: str, choices: Sequence, func: str = 'suggest_categorical')"}, {"fullname": "debeir.training.hparm_tuning.types.HparamCategorical.suggest", "modulename": "debeir.training.hparm_tuning.types", "qualname": "HparamCategorical.suggest", "kind": "function", "doc": "

    \n", "signature": "(self, trial: optuna.trial._trial.Trial):", "funcdef": "def"}, {"fullname": "debeir.training.hparm_tuning.types.HparamUniform", "modulename": "debeir.training.hparm_tuning.types", "qualname": "HparamUniform", "kind": "class", "doc": "

    \n", "bases": "Hparam"}, {"fullname": "debeir.training.hparm_tuning.types.HparamUniform.__init__", "modulename": "debeir.training.hparm_tuning.types", "qualname": "HparamUniform.__init__", "kind": "function", "doc": "

    \n", "signature": "(name: str, low: float, high: float, func: str = 'suggest_uniform')"}, {"fullname": "debeir.training.hparm_tuning.types.HparamUniform.suggest", "modulename": "debeir.training.hparm_tuning.types", "qualname": "HparamUniform.suggest", "kind": "function", "doc": "

    \n", "signature": "(self, trial: optuna.trial._trial.Trial):", "funcdef": "def"}, {"fullname": "debeir.training.hparm_tuning.types.HparamLogUniform", "modulename": "debeir.training.hparm_tuning.types", "qualname": "HparamLogUniform", "kind": "class", "doc": "

    \n", "bases": "Hparam"}, {"fullname": "debeir.training.hparm_tuning.types.HparamLogUniform.__init__", "modulename": "debeir.training.hparm_tuning.types", "qualname": "HparamLogUniform.__init__", "kind": "function", "doc": "

    \n", "signature": "(name: str, low: float, high: float, func: str = 'suggest_loguniform')"}, {"fullname": "debeir.training.hparm_tuning.types.HparamLogUniform.suggest", "modulename": "debeir.training.hparm_tuning.types", "qualname": "HparamLogUniform.suggest", "kind": "function", "doc": "

    \n", "signature": "(self, trial: optuna.trial._trial.Trial):", "funcdef": "def"}, {"fullname": "debeir.training.hparm_tuning.types.HparamDiscreteUniform", "modulename": "debeir.training.hparm_tuning.types", "qualname": "HparamDiscreteUniform", "kind": "class", "doc": "

    \n", "bases": "Hparam"}, {"fullname": "debeir.training.hparm_tuning.types.HparamDiscreteUniform.__init__", "modulename": "debeir.training.hparm_tuning.types", "qualname": "HparamDiscreteUniform.__init__", "kind": "function", "doc": "

    \n", "signature": "(\tname: str,\tlow: float,\thigh: float,\tq: float,\tfunc: str = 'suggest_discrete_uniform')"}, {"fullname": "debeir.training.hparm_tuning.types.HparamDiscreteUniform.suggest", "modulename": "debeir.training.hparm_tuning.types", "qualname": "HparamDiscreteUniform.suggest", "kind": "function", "doc": "

    \n", "signature": "(self, trial: optuna.trial._trial.Trial):", "funcdef": "def"}, {"fullname": "debeir.training.losses", "modulename": "debeir.training.losses", "kind": "module", "doc": "

    \n"}, {"fullname": "debeir.training.losses.contrastive", "modulename": "debeir.training.losses.contrastive", "kind": "module", "doc": "

    Author: Yonglong Tian (yonglong@mit.edu)
    Date: May 07, 2020

    \n\n

    Code imported from: https://github.com/HobbitLong/SupContrast/blob/master/losses.py

    \n"}, {"fullname": "debeir.training.losses.contrastive.SupConLoss", "modulename": "debeir.training.losses.contrastive", "qualname": "SupConLoss", "kind": "class", "doc": "

    Supervised Contrastive Learning: https://arxiv.org/pdf/2004.11362.pdf. It also supports the unsupervised contrastive loss in SimCLR.

    \n", "bases": "torch.nn.modules.module.Module"}, {"fullname": "debeir.training.losses.contrastive.SupConLoss.__init__", "modulename": "debeir.training.losses.contrastive", "qualname": "SupConLoss.__init__", "kind": "function", "doc": "

    Initializes internal Module state, shared by both nn.Module and ScriptModule.

    \n", "signature": "(temperature=1.0, contrast_mode='all', base_temperature=1.0)"}, {"fullname": "debeir.training.losses.contrastive.SupConLoss.forward", "modulename": "debeir.training.losses.contrastive", "qualname": "SupConLoss.forward", "kind": "function", "doc": "

    Compute loss for model. If both labels and mask are None, it degenerates to the SimCLR unsupervised loss: https://arxiv.org/pdf/2002.05709.pdf

    Args:
        features: hidden vector of shape [bsz, n_views, ...].
        labels: ground truth of shape [bsz].
        mask: contrastive mask of shape [bsz, bsz]; mask_{i,j}=1 if sample j has the same class as sample i. Can be asymmetric.

    Returns:
        A loss scalar.

    \n", "signature": "(self, features, labels=None, mask=None):", "funcdef": "def"}, {"fullname": "debeir.training.losses.contrastive.SiameseDistanceMetric", "modulename": "debeir.training.losses.contrastive", "qualname": "SiameseDistanceMetric", "kind": "class", "doc": "

    Distance metrics for the contrastive loss

    \n", "bases": "enum.Enum"}, {"fullname": "debeir.training.losses.contrastive.SiameseDistanceMetric.EUCLIDEAN", "modulename": "debeir.training.losses.contrastive", "qualname": "SiameseDistanceMetric.EUCLIDEAN", "kind": "function", "doc": "

    \n", "signature": "(x, y):", "funcdef": "def"}, {"fullname": "debeir.training.losses.contrastive.SiameseDistanceMetric.MANHATTAN", "modulename": "debeir.training.losses.contrastive", "qualname": "SiameseDistanceMetric.MANHATTAN", "kind": "function", "doc": "

    \n", "signature": "(x, y):", "funcdef": "def"}, {"fullname": "debeir.training.losses.contrastive.SiameseDistanceMetric.COSINE_DISTANCE", "modulename": "debeir.training.losses.contrastive", "qualname": "SiameseDistanceMetric.COSINE_DISTANCE", "kind": "function", "doc": "

    \n", "signature": "(x, y):", "funcdef": "def"}, {"fullname": "debeir.training.losses.contrastive.ContrastiveSentLoss", "modulename": "debeir.training.losses.contrastive", "qualname": "ContrastiveSentLoss", "kind": "class", "doc": "

    Contrastive loss. Expects as input two texts and a label of either 0 or 1. If the label == 1, the distance between the two embeddings is reduced. If the label == 0, the distance between the embeddings is increased. Further information: http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf

    \n\n
    Parameters
    \n\n
      \n
    • model: SentenceTransformer model
    • \n
    • distance_metric: Function that returns a distance between two embeddings. The class SiameseDistanceMetric contains pre-defined metrics that can be used.
    • \n
    • margin: Negative samples (label == 0) should have a distance of at least the margin value.
    • \n
    • size_average: Average by the size of the mini-batch.

    Example::

        from sentence_transformers import SentenceTransformer, losses, InputExample
        from torch.utils.data import DataLoader

        model = SentenceTransformer('all-MiniLM-L6-v2')
        train_examples = [
            InputExample(texts=['This is a positive pair', 'Where the distance will be minimized'], label=1),
            InputExample(texts=['This is a negative pair', 'Their distance will be increased'], label=0)]
        train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=2)
        train_loss = losses.ContrastiveLoss(model=model)
        model.fit([(train_dataloader, train_loss)], show_progress_bar=True)
    • \n
    \n", "bases": "torch.nn.modules.module.Module"}, {"fullname": "debeir.training.losses.contrastive.ContrastiveSentLoss.__init__", "modulename": "debeir.training.losses.contrastive", "qualname": "ContrastiveSentLoss.__init__", "kind": "function", "doc": "

    Initializes internal Module state, shared by both nn.Module and ScriptModule.

    \n", "signature": "(\tmodel,\tdistance_metric=<function SiameseDistanceMetric.<lambda>>,\tmargin: float = 0.5,\tsize_average: bool = True)"}, {"fullname": "debeir.training.losses.contrastive.ContrastiveSentLoss.get_config_dict", "modulename": "debeir.training.losses.contrastive", "qualname": "ContrastiveSentLoss.get_config_dict", "kind": "function", "doc": "

    \n", "signature": "(self):", "funcdef": "def"}, {"fullname": "debeir.training.losses.contrastive.ContrastiveSentLoss.forward", "modulename": "debeir.training.losses.contrastive", "qualname": "ContrastiveSentLoss.forward", "kind": "function", "doc": "

    Defines the computation performed at every call.

    \n", "signature": "(\tself,\tsentence_features: Iterable[Dict[str, torch.Tensor]],\tlabels: torch.Tensor):", "funcdef": "def"}, {"fullname": "debeir.training.losses.ranking", "modulename": "debeir.training.losses.ranking", "kind": "module", "doc": "

    Losses are drawn from the allrank library

    \n"}, {"fullname": "debeir.training.train_reranker", "modulename": "debeir.training.train_reranker", "kind": "module", "doc": "

    \n"}, {"fullname": "debeir.training.train_reranker.train_cross_encoder_reranker", "modulename": "debeir.training.train_reranker", "qualname": "train_cross_encoder_reranker", "kind": "function", "doc": "

    Trains a reranker with relevance signals

    \n\n
    Parameters
    \n\n
      \n
    • model_fp_or_name: The model name or path to the model
    • \n
    • output_dir: Output directory to save model, logs etc.
    • \n
    • train_dataset: Training Examples
    • \n
    • dev_dataset: Dev examples
    • \n
    • train_batch_size: Training batch size
    • \n
    • num_epochs: Number of epochs
    • \n
    • warmup_steps: Warmup steps for the scheduler
    • \n
    • evaluate_every_n_step: Evaluate the model every n steps
    • \n
    • special_tokens: Special tokens to add, defaults to [DOC], [QRY] tokens (bi-encoder)
    • \n
    • pooling_mode: Pooling mode for a sentence transformer model
    • \n
    • loss_func: Loss function(s) to use
    • \n
    • evaluator: Evaluator to use
    • \n
    \n", "signature": "(\tmodel_fp_or_name: str,\toutput_dir: str,\ttrain_dataset: List[debeir.datasets.types.RelevanceExample],\tdev_dataset: List[debeir.datasets.types.RelevanceExample],\ttrain_batch_size=32,\tnum_epochs=3,\twarmup_steps=None,\tevaluate_every_n_step: int = 1000,\tspecial_tokens=None,\tpooling_mode=None,\tloss_func=None,\tevaluator: sentence_transformers.evaluation.SentenceEvaluator.SentenceEvaluator = None,\t*args,\t**kwargs):", "funcdef": "def"}, {"fullname": "debeir.training.train_sentence_encoder", "modulename": "debeir.training.train_sentence_encoder", "kind": "module", "doc": "

    \n"}, {"fullname": "debeir.training.train_sentence_encoder.train_biencoder", "modulename": "debeir.training.train_sentence_encoder", "qualname": "train_biencoder", "kind": "function", "doc": "

    Train a universal sentence encoder

    \n\n
    Parameters
    \n\n
      \n
    • model_fp_or_name: The model name or path to the model
    • \n
    • output_dir: Output directory to save model, logs etc.
    • \n
    • train_examples: Training Examples
    • \n
    • dev_examples: Dev examples
    • \n
    • train_batch_size: Training batch size
    • \n
    • num_epochs: Number of epochs
    • \n
    • warmup_steps: Warmup steps for the scheduler
    • \n
    • evaluate_every_n_step: Evaluate the model every n steps
    • \n
    • special_tokens: Special tokens to add
    • \n
    • pooling_mode: Pooling mode for a sentence transformer model
    • \n
    • loss_func: Loss function(s) to use
    • \n
    • evaluator: Evaluator to use
    • \n
    \n", "signature": "(\tmodel_fp_or_name: str,\toutput_dir: str,\ttrain_examples: List[debeir.datasets.types.InputExample],\tdev_examples: List[debeir.datasets.types.InputExample],\ttrain_batch_size=32,\tnum_epochs=3,\twarmup_steps=None,\tevaluate_every_n_step: int = 1000,\tspecial_tokens=None,\tpooling_mode=None,\tloss_func=None,\tevaluator: sentence_transformers.evaluation.SentenceEvaluator.SentenceEvaluator = None,\t*args,\t**kwargs):", "funcdef": "def"}, {"fullname": "debeir.training.train_sentence_encoder.train_huggingface_transformer", "modulename": "debeir.training.train_sentence_encoder", "qualname": "train_huggingface_transformer", "kind": "function", "doc": "

    Train a transformer model using the Hugging Face API

    \n\n
    Parameters
    \n\n
      \n
    • model_fp_or_name_or_cls: Model name or model class to instantiate
    • \n
    • tokenizer: Tokenizer
    • \n
    • output_dir: Output directory to write to
    • \n
    • compute_metric_fn: Metric function to compute metrics
    • \n
    • metric: Metric used by the compute_metric_fn
    • \n
    • dataset: Huggingface Dataset Dict
    • \n
    • train_dataset: Training dataset to be used by the Trainer class
    • \n
    • eval_dataset: Evaluation dataset to be used by the Trainer class
    • \n
    • train_batch_size: Batch size to use for training
    • \n
    • num_epochs: Number of training epochs (default: 3)
    • \n
    • learning_rate: Learning rate (default: 5e-5)
    • \n
    • lr_scheduler_type: Learning rate type, see SchedulerType
    • \n
    • optimizer: Optimizer
    • \n
    • warmup_ratio: Warmup ratio as a fraction of total steps (default 0.1)
    • \n
    • evaluate_every_n_step: Evaluate the model every n steps
    • \n
    • pooling_mode: Pooling mode for your model
    • \n
    • loss_func: Loss function to instantiate model
    • \n
    • model_args: Model arguments to pass
    • \n
    • model_kwargs: Model keyword arguments
    • \n
    • padding_strategy: Tokenization padding strategy
    • \n
    • truncate: Truncation strategy for tokenization
    • \n
    • special_tokens: Special tokens to add to the tokenizer
    • \n
    • seed: Dataset shuffle seed
    • \n
    • args:
    • \n
    • kwargs:
    • \n
    \n\n
    Returns
    \n", "signature": "(\tmodel_fp_or_name_or_cls: Union[str, transformers.modeling_utils.PreTrainedModel],\ttokenizer: transformers.tokenization_utils.PreTrainedTokenizer,\toutput_dir: str,\tcompute_metric_fn,\tmetric: datasets.metric.Metric,\tdataset: datasets.dataset_dict.DatasetDict = None,\ttrain_dataset: List[Union[debeir.datasets.types.RelevanceExample, debeir.datasets.types.InputExample, datasets.arrow_dataset.Dataset]] = None,\teval_dataset: List[Union[debeir.datasets.types.RelevanceExample, debeir.datasets.types.InputExample, datasets.arrow_dataset.Dataset]] = None,\ttrain_batch_size=32,\tnum_epochs=3,\tlearning_rate=5e-05,\tlr_scheduler_type: transformers.trainer_utils.SchedulerType = <SchedulerType.CONSTANT_WITH_WARMUP: 'constant_with_warmup'>,\toptimizer: str = 'adamw_hf',\twarmup_ratio=0.1,\tevaluate_every_n_step: int = 1000,\tpooling_mode=None,\tloss_func=None,\tmodel_args=None,\tmodel_kwargs=None,\tpadding_strategy='max_length',\ttruncate=True,\tspecial_tokens=None,\tseed=42,\t*args,\t**kwargs) -> transformers.trainer.Trainer:", "funcdef": "def"}, {"fullname": "debeir.training.utils", "modulename": "debeir.training.utils", "kind": "module", "doc": "

    \n"}, {"fullname": "debeir.training.utils.LoggingScheduler", "modulename": "debeir.training.utils", "qualname": "LoggingScheduler", "kind": "class", "doc": "

    \n"}, {"fullname": "debeir.training.utils.LoggingScheduler.__init__", "modulename": "debeir.training.utils", "qualname": "LoggingScheduler.__init__", "kind": "function", "doc": "

    \n", "signature": "(scheduler: torch.optim.lr_scheduler.LambdaLR)"}, {"fullname": "debeir.training.utils.LoggingScheduler.step", "modulename": "debeir.training.utils", "qualname": "LoggingScheduler.step", "kind": "function", "doc": "

    \n", "signature": "(self, epoch=None):", "funcdef": "def"}, {"fullname": "debeir.training.utils.get_scheduler_with_wandb", "modulename": "debeir.training.utils", "qualname": "get_scheduler_with_wandb", "kind": "function", "doc": "

    Returns the requested learning rate scheduler. Available schedulers: constantlr, warmupconstant, warmuplinear, warmupcosine, warmupcosinewithhardrestarts

    \n", "signature": "(optimizer, scheduler: str, warmup_steps: int, t_total: int):", "funcdef": "def"}, {"fullname": "debeir.training.utils.LoggingLoss", "modulename": "debeir.training.utils", "qualname": "LoggingLoss", "kind": "class", "doc": "

    \n"}, {"fullname": "debeir.training.utils.LoggingLoss.__init__", "modulename": "debeir.training.utils", "qualname": "LoggingLoss.__init__", "kind": "function", "doc": "

    \n", "signature": "(loss_fn)"}, {"fullname": "debeir.training.utils.TokenizerOverload", "modulename": "debeir.training.utils", "qualname": "TokenizerOverload", "kind": "class", "doc": "

    \n"}, {"fullname": "debeir.training.utils.TokenizerOverload.__init__", "modulename": "debeir.training.utils", "qualname": "TokenizerOverload.__init__", "kind": "function", "doc": "

    \n", "signature": "(tokenizer, tokenizer_kwargs, debug=False)"}, {"fullname": "debeir.training.utils.LoggingEvaluator", "modulename": "debeir.training.utils", "qualname": "LoggingEvaluator", "kind": "class", "doc": "

    \n"}, {"fullname": "debeir.training.utils.LoggingEvaluator.__init__", "modulename": "debeir.training.utils", "qualname": "LoggingEvaluator.__init__", "kind": "function", "doc": "

    \n", "signature": "(evaluator)"}, {"fullname": "debeir.training.utils.SentDataset", "modulename": "debeir.training.utils", "qualname": "SentDataset", "kind": "class", "doc": "

    \n"}, {"fullname": "debeir.training.utils.SentDataset.__init__", "modulename": "debeir.training.utils", "qualname": "SentDataset.__init__", "kind": "function", "doc": "

    \n", "signature": "(\tdataset: datasets.arrow_dataset.Dataset,\ttext_cols: List[str],\tlabel_col: str = None,\tlabel=None)"}, {"fullname": "debeir.training.utils.SentDatasetList", "modulename": "debeir.training.utils", "qualname": "SentDatasetList", "kind": "class", "doc": "

    \n"}, {"fullname": "debeir.training.utils.SentDatasetList.__init__", "modulename": "debeir.training.utils", "qualname": "SentDatasetList.__init__", "kind": "function", "doc": "

    \n", "signature": "(datasets: List[debeir.training.utils.SentDataset])"}, {"fullname": "debeir.training.utils.tokenize_function", "modulename": "debeir.training.utils", "qualname": "tokenize_function", "kind": "function", "doc": "

    Tokenizer function

    \n\n
    Parameters
    \n\n
      \n
    • tokenizer: Tokenizer
    • \n
    • examples: Input examples to tokenize
    • \n
    • padding_strategy: Padding strategy
    • \n
    • truncate: Truncate sentences
    • \n
    \n\n
    Returns
    \n\n
    \n
    Returns a list of tokenized examples\n
    \n
    \n", "signature": "(tokenizer, examples, padding_strategy, truncate):", "funcdef": "def"}, {"fullname": "debeir.training.utils.get_max_seq_length", "modulename": "debeir.training.utils", "qualname": "get_max_seq_length", "kind": "function", "doc": "

    \n", "signature": "(tokenizer, dataset, x_labels, dataset_key='train'):", "funcdef": "def"}, {"fullname": "debeir.utils", "modulename": "debeir.utils", "kind": "module", "doc": "

    Common utilities, such as score normalization and creating an output directory with checks

    \n"}, {"fullname": "debeir.utils.scaler", "modulename": "debeir.utils.scaler", "kind": "module", "doc": "

    \n"}, {"fullname": "debeir.utils.scaler.unpack_elasticsearch_scores", "modulename": "debeir.utils.scaler", "qualname": "unpack_elasticsearch_scores", "kind": "function", "doc": "

    Helper function to retrieve the top document score for each topic. Used in the NIR weight adjustment calculation.

    \n\n
    Parameters
    \n\n
      \n
    • results: Raw input of results from Elasticsearch library
    • \n
    \n\n
    Returns
    \n\n
    \n
    Returns a 1-D dictionary of {topic_num: top_score} pairs.\n
    \n
    \n", "signature": "(results) -> Dict:", "funcdef": "def"}, {"fullname": "debeir.utils.scaler.get_z_value", "modulename": "debeir.utils.scaler", "qualname": "get_z_value", "kind": "function", "doc": "

    Analytical solution for the normalization constant, z, used in NIR log normalization.

    \n\n
    Parameters
    \n\n
      \n
    • cosine_ceiling: The highest theoretical additive cosine score
    • \n
    • bm25_ceiling: The highest BM25 score retrieved from a given topic OR an estimate.
    • \n
    \n\n
    Returns
    \n\n
    \n
    The normalization parameter for NIR log normalization.\n
    \n
    \n", "signature": "(cosine_ceiling, bm25_ceiling) -> float:", "funcdef": "def"}, {"fullname": "debeir.utils.utils", "modulename": "debeir.utils.utils", "kind": "module", "doc": "

    \n"}, {"fullname": "debeir.utils.utils.create_output_file", "modulename": "debeir.utils.utils", "qualname": "create_output_file", "kind": "function", "doc": "

    Create an output file based on config instructions

    \n\n
    Parameters
    \n\n
      \n
    • config: The config object with output file options.
    • \n
    • config_fp: The config file path used in default naming options for the output file.
    • \n
    • remove: Overwrites the output file if it exists
    • \n
    • output_file: The output file path if it exists
    • \n
    • output_directory: The output directory used for default naming (specified in nir config)
    • \n
    • kwargs: Compatibility arguments
    • \n
    \n\n
    Returns
    \n", "signature": "(config, config_fp, remove, output_file, output_directory, **kwargs):", "funcdef": "def"}, {"fullname": "debeir.utils.utils.unpack_coroutine", "modulename": "debeir.utils.utils", "qualname": "unpack_coroutine", "kind": "function", "doc": "

    Recursively unwraps co-routines until a result is reached.

    \n\n
    Parameters
    \n\n
      \n
    • f: Wrapped co-routine function.
    • \n
    \n\n
    Returns
    \n\n
    \n
    Results from the (final) evaluated co-routine.\n
    \n
    \n", "signature": "(f):", "funcdef": "async def"}, {"fullname": "debeir.utils.utils.flatten", "modulename": "debeir.utils.utils", "qualname": "flatten", "kind": "function", "doc": "

    Flattens a multidimensional dictionary (a dictionary of dictionaries) into a single layer, with child keys separated by "sep".

    \n\n
    Parameters
    \n\n
      \n
    • d: Multi-level dictionary to flatten.
    • \n
    • parent_key: Prepend a parent_key to all layers.
    • \n
    • sep: Separator token between child and parent layers.
    • \n
    \n\n
    Returns
    \n\n
    \n
    A flattened 1-D dictionary with keys separated by *sep*.\n
    \n
    \n", "signature": "(d, parent_key='', sep='_'):", "funcdef": "def"}, {"fullname": "debeir.utils.utils.remove_excess_whitespace", "modulename": "debeir.utils.utils", "qualname": "remove_excess_whitespace", "kind": "function", "doc": "

    \n", "signature": "(s):", "funcdef": "def"}]; // mirrored in build-search-index.js (part 1) // Also split on html tags. this is a cheap heuristic, but good enough.