1 """
2 The DeBEIR (Dense Bi-Encoder Information Retrieval) source code library.
3
-4 See ./main.py in the parent directory for an out-of-the-box runnable code.
+4 See ./examples/ in the parent directory for an out-of-the-box runnable code.
5
6 Otherwise, check out notebooks in the parent directory for training your own model amongst other things.
7 """
diff --git a/docs/debeir/core.html b/docs/debeir/core.html
index b05bd8c..4e665ca 100644
--- a/docs/debeir/core.html
+++ b/docs/debeir/core.html
@@ -57,7 +57,7 @@
Core library interfaces that must be implemented for custom datasets
-
Interfaces to implement custom data_sets in nir.data_sets.
+
Interfaces to implement custom datasets in debeir.datasets .
@@ -67,7 +67,7 @@
1 """
2 Core library interfaces that must be implemented for custom datasets
3
-4 Interfaces to implement custom data_sets in nir.data_sets.
+4 Interfaces to implement custom datasets in debeir.datasets.
5 """
diff --git a/docs/debeir/data_sets.html b/docs/debeir/data_sets.html
deleted file mode 100644
index 813765b..0000000
--- a/docs/debeir/data_sets.html
+++ /dev/null
@@ -1,264 +0,0 @@
-
-
-
-
-
-
- debeir.data_sets API documentation
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- Contains data_sets implemented from nir.interfaces
-
-
-Parser (For reading data from files into a Dict object)
-Query object (Generating queries)
-
-These query objects can be very lightweight containing only the mappings of the index.
-
-
-
-
-
-
- View Source
-
- 1 """
-2 Contains data_sets implemented from nir.interfaces
-3 1. Parser (For reading data from files into a Dict object)
-4 2. Query object (Generating queries)
-5 - These query objects can be very lightweight containing only the mappings of the index.
-6 """
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/docs/debeir/data_sets/bioreddit.html b/docs/debeir/data_sets/bioreddit.html
deleted file mode 100644
index e0ba72c..0000000
--- a/docs/debeir/data_sets/bioreddit.html
+++ /dev/null
@@ -1,546 +0,0 @@
-
-
-
-
-
-
- debeir.data_sets.bioreddit API documentation
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- View Source
-
- 1 from typing import Dict
- 2
- 3 from debeir.interfaces.query import GenericElasticsearchQuery
- 4 from debeir.interfaces.parser import CSVParser
- 5
- 6
- 7 class BioRedditSubmissionParser ( CSVParser ):
- 8 """
- 9 Parser for the BioReddit Submission Dataset
-10 """
-11 parse_fields = [ "id" , "body" ]
-12
-13 @classmethod
-14 def get_topics ( cls , csvfile ) -> Dict [ int , Dict [ str , str ]]:
-15 return super () . get_topics ( csvfile )
-16
-17
-18 class BioRedditCommentParser ( CSVParser ):
-19 """
-20 Parser for the BioReddit Comment Dataset
-21 """
-22 parse_fields = [ "id" , "parent_id" , "selftext" , "title" ]
-23
-24 @classmethod
-25 def get_topics ( cls , csvfile ) -> Dict [ str , Dict [ str , str ]]:
-26 topics = super () . get_topics ( csvfile )
-27 temp = {}
-28
-29 for _ , topic in topics . items ():
-30 topic [ "text" ] = topic . pop ( "selftext" )
-31 topic [ "text2" ] = topic . pop ( "title" )
-32 temp [ topic [ "id" ]] = topic
-33
-34 return temp
-35
-36
-37 class BioRedditElasticsearchQuery ( GenericElasticsearchQuery ):
-38 """
-39 Elasticsearch Query object for the BioReddit
-40 """
-41 def __init__ ( self , topics , config , * args , ** kwargs ):
-42 super () . __init__ ( topics , config , * args , ** kwargs )
-43 self . mappings = [ "Text" ]
-44
-45 self . topics = topics
-46 self . config = config
-47 self . query_type = self . config . query_type
-48
-49 self . embed_mappings = [ "Text_Embedding" ]
-50
-51 self . query_funcs = {
-52 "query" : self . generate_query ,
-53 "embedding" : self . generate_query_embedding ,
-54 }
-
-
-
-
-
-
-
-
- 8 class BioRedditSubmissionParser ( CSVParser ):
- 9 """
-10 Parser for the BioReddit Submission Dataset
-11 """
-12 parse_fields = [ "id" , "body" ]
-13
-14 @classmethod
-15 def get_topics ( cls , csvfile ) -> Dict [ int , Dict [ str , str ]]:
-16 return super () . get_topics ( csvfile )
-
-
-
- Parser for the BioReddit Submission Dataset
-
-
-
-
-
-
-
@classmethod
-
-
def
-
get_topics (cls , csvfile ) -> Dict [ int , Dict [ str , str ]] :
-
-
View Source
-
-
-
-
14 @classmethod
-15 def get_topics ( cls , csvfile ) -> Dict [ int , Dict [ str , str ]]:
-16 return super () . get_topics ( csvfile )
-
-
-
-
Instance method for getting topics, forwards instance self parameters to the _get_topics class method.
-
-
-
-
-
-
Inherited Members
-
-
-
-
-
-
-
-
-
-
-
- 38 class BioRedditElasticsearchQuery ( GenericElasticsearchQuery ):
-39 """
-40 Elasticsearch Query object for the BioReddit
-41 """
-42 def __init__ ( self , topics , config , * args , ** kwargs ):
-43 super () . __init__ ( topics , config , * args , ** kwargs )
-44 self . mappings = [ "Text" ]
-45
-46 self . topics = topics
-47 self . config = config
-48 self . query_type = self . config . query_type
-49
-50 self . embed_mappings = [ "Text_Embedding" ]
-51
-52 self . query_funcs = {
-53 "query" : self . generate_query ,
-54 "embedding" : self . generate_query_embedding ,
-55 }
-
-
-
- Elasticsearch Query object for the BioReddit
-
-
-
-
-
-
-
- BioRedditElasticsearchQuery (topics , config , * args , ** kwargs )
-
- View Source
-
-
-
-
42 def __init__ ( self , topics , config , * args , ** kwargs ):
-43 super () . __init__ ( topics , config , * args , ** kwargs )
-44 self . mappings = [ "Text" ]
-45
-46 self . topics = topics
-47 self . config = config
-48 self . query_type = self . config . query_type
-49
-50 self . embed_mappings = [ "Text_Embedding" ]
-51
-52 self . query_funcs = {
-53 "query" : self . generate_query ,
-54 "embedding" : self . generate_query_embedding ,
-55 }
-
-
-
-
-
-
-
-
Inherited Members
-
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/docs/debeir/data_sets/clinical_trials.html b/docs/debeir/data_sets/clinical_trials.html
deleted file mode 100644
index 525d0b6..0000000
--- a/docs/debeir/data_sets/clinical_trials.html
+++ /dev/null
@@ -1,2111 +0,0 @@
-
-
-
-
-
-
- debeir.data_sets.clinical_trials API documentation
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- View Source
-
- 1 import csv
- 2 import loguru
- 3
- 4 from dataclasses import dataclass
- 5 from typing import Dict , Union , Optional , List
- 6 from elasticsearch import AsyncElasticsearch as Elasticsearch
- 7
- 8 from debeir.interfaces.executor import GenericElasticsearchExecutor
- 9 from debeir.interfaces.query import GenericElasticsearchQuery
- 10 from debeir.engines.elasticsearch.generate_script_score import generate_script
- 11
- 12 from debeir.interfaces.config import GenericConfig , apply_config
- 13 from debeir.interfaces.parser import Parser
- 14 from debeir.rankers.transformer_sent_encoder import Encoder
- 15 from debeir.utils.scaler import get_z_value
- 16
- 17
- 18 @dataclass ( init = True , unsafe_hash = True )
- 19 class TrialsQueryConfig ( GenericConfig ):
- 20 query_field_usage : str = None
- 21 embed_field_usage : str = None
- 22 fields : List [ str ] = None
- 23
- 24 def validate ( self ):
- 25 """
- 26 Checks if query type is included, and checks if an encoder is included for embedding queries
- 27 """
- 28 if self . query_type == "embedding" :
- 29 assert self . query_field_usage and self . embed_field_usage , (
- 30 "Must have both field usages" " if embedding query"
- 31 )
- 32 assert (
- 33 self . encoder_fp and self . encoder
- 34 ), "Must provide encoder path for embedding model"
- 35 assert self . norm_weight is not None or self . automatic is not None , (
- 36 "Norm weight be specified or be " "automatic "
- 37 )
- 38
- 39 assert (
- 40 self . query_field_usage is not None or self . fields is not None
- 41 ), "Must have a query field"
- 42 assert self . query_type in [
- 43 "ablation" ,
- 44 "query" ,
- 45 "query_best" ,
- 46 "embedding" ,
- 47 ], "Check your query type"
- 48
- 49 @classmethod
- 50 def from_toml ( cls , fp : str , * args , ** kwargs ) -> "GenericConfig" :
- 51 return super () . from_toml ( fp , cls , * args , ** kwargs )
- 52
- 53 @classmethod
- 54 def from_dict ( cls , ** kwargs ) -> "GenericConfig" :
- 55 return super () . from_dict ( cls , ** kwargs )
- 56
- 57
- 58 class TrialsElasticsearchQuery ( GenericElasticsearchQuery ):
- 59 """
- 60 Elasticsearch Query object for the Clinical Trials Index
- 61 """
- 62 topics : Dict [ int , Dict [ str , str ]]
- 63 query_type : str
- 64 fields : List [ int ]
- 65 query_funcs : Dict
- 66 config : GenericConfig
- 67 id_mapping : str = "_id"
- 68 mappings : List [ str ]
- 69 config : TrialsQueryConfig
- 70
- 71 def __init__ ( self , topics , query_type , config = None , * args , ** kwargs ):
- 72 super () . __init__ ( topics , config , * args , ** kwargs )
- 73 self . query_type = query_type
- 74 self . config = config
- 75 self . topics = topics
- 76 self . fields = []
- 77 self . mappings = [
- 78 "HasExpandedAccess" ,
- 79 "BriefSummary.Textblock" ,
- 80 "CompletionDate.Type" ,
- 81 "OversightInfo.Text" ,
- 82 "OverallContactBackup.PhoneExt" ,
- 83 "RemovedCountries.Text" ,
- 84 "SecondaryOutcome" ,
- 85 "Sponsors.LeadSponsor.Text" ,
- 86 "BriefTitle" ,
- 87 "IDInfo.NctID" ,
- 88 "IDInfo.SecondaryID" ,
- 89 "OverallContactBackup.Phone" ,
- 90 "Eligibility.StudyPop.Textblock" ,
- 91 "DetailedDescription.Textblock" ,
- 92 "Eligibility.MinimumAge" ,
- 93 "Sponsors.Collaborator" ,
- 94 "Reference" ,
- 95 "Eligibility.Criteria.Textblock" ,
- 96 "XMLName.Space" ,
- 97 "Rank" ,
- 98 "OverallStatus" ,
- 99 "InterventionBrowse.Text" ,
-100 "Eligibility.Text" ,
-101 "Intervention" ,
-102 "BiospecDescr.Textblock" ,
-103 "ResponsibleParty.NameTitle" ,
-104 "NumberOfArms" ,
-105 "ResponsibleParty.ResponsiblePartyType" ,
-106 "IsSection801" ,
-107 "Acronym" ,
-108 "Eligibility.MaximumAge" ,
-109 "DetailedDescription.Text" ,
-110 "StudyDesign" ,
-111 "OtherOutcome" ,
-112 "VerificationDate" ,
-113 "ConditionBrowse.MeshTerm" ,
-114 "Enrollment.Text" ,
-115 "IDInfo.Text" ,
-116 "ConditionBrowse.Text" ,
-117 "FirstreceivedDate" ,
-118 "NumberOfGroups" ,
-119 "OversightInfo.HasDmc" ,
-120 "PrimaryCompletionDate.Text" ,
-121 "ResultsReference" ,
-122 "Eligibility.StudyPop.Text" ,
-123 "IsFdaRegulated" ,
-124 "WhyStopped" ,
-125 "ArmGroup" ,
-126 "OverallContact.LastName" ,
-127 "Phase" ,
-128 "RemovedCountries.Country" ,
-129 "InterventionBrowse.MeshTerm" ,
-130 "Eligibility.HealthyVolunteers" ,
-131 "Location" ,
-132 "OfficialTitle" ,
-133 "OverallContact.Email" ,
-134 "RequiredHeader.Text" ,
-135 "RequiredHeader.URL" ,
-136 "LocationCountries.Country" ,
-137 "OverallContact.PhoneExt" ,
-138 "Condition" ,
-139 "PrimaryOutcome" ,
-140 "LocationCountries.Text" ,
-141 "BiospecDescr.Text" ,
-142 "IDInfo.OrgStudyID" ,
-143 "Link" ,
-144 "OverallContact.Phone" ,
-145 "Source" ,
-146 "ResponsibleParty.InvestigatorAffiliation" ,
-147 "StudyType" ,
-148 "FirstreceivedResultsDate" ,
-149 "Enrollment.Type" ,
-150 "Eligibility.Gender" ,
-151 "OverallContactBackup.LastName" ,
-152 "Keyword" ,
-153 "BiospecRetention" ,
-154 "CompletionDate.Text" ,
-155 "OverallContact.Text" ,
-156 "RequiredHeader.DownloadDate" ,
-157 "Sponsors.Text" ,
-158 "Text" ,
-159 "Eligibility.SamplingMethod" ,
-160 "LastchangedDate" ,
-161 "ResponsibleParty.InvestigatorFullName" ,
-162 "StartDate" ,
-163 "RequiredHeader.LinkText" ,
-164 "OverallOfficial" ,
-165 "Sponsors.LeadSponsor.AgencyClass" ,
-166 "OverallContactBackup.Text" ,
-167 "Eligibility.Criteria.Text" ,
-168 "XMLName.Local" ,
-169 "OversightInfo.Authority" ,
-170 "PrimaryCompletionDate.Type" ,
-171 "ResponsibleParty.Organization" ,
-172 "IDInfo.NctAlias" ,
-173 "ResponsibleParty.Text" ,
-174 "TargetDuration" ,
-175 "Sponsors.LeadSponsor.Agency" ,
-176 "BriefSummary.Text" ,
-177 "OverallContactBackup.Email" ,
-178 "ResponsibleParty.InvestigatorTitle" ,
-179 ]
-180
-181 self . best_recall_fields = [
-182 "LocationCountries.Country" ,
-183 "BiospecRetention" ,
-184 "DetailedDescription.Textblock" ,
-185 "HasExpandedAccess" ,
-186 "ConditionBrowse.MeshTerm" ,
-187 "RequiredHeader.LinkText" ,
-188 "WhyStopped" ,
-189 "BriefSummary.Textblock" ,
-190 "Eligibility.Criteria.Textblock" ,
-191 "OfficialTitle" ,
-192 "Eligibility.MaximumAge" ,
-193 "Eligibility.StudyPop.Textblock" ,
-194 "BiospecDescr.Textblock" ,
-195 "BriefTitle" ,
-196 "Eligibility.MinimumAge" ,
-197 "ResponsibleParty.Organization" ,
-198 "TargetDuration" ,
-199 "Condition" ,
-200 "IDInfo.OrgStudyID" ,
-201 "Keyword" ,
-202 "Source" ,
-203 "Sponsors.LeadSponsor.Agency" ,
-204 "ResponsibleParty.InvestigatorAffiliation" ,
-205 "OversightInfo.Authority" ,
-206 "OversightInfo.HasDmc" ,
-207 "OverallContact.Phone" ,
-208 "Phase" ,
-209 "OverallContactBackup.LastName" ,
-210 "Acronym" ,
-211 "InterventionBrowse.MeshTerm" ,
-212 "RemovedCountries.Country" ,
-213 ]
-214 self . best_map_fields = [
-215 "Eligibility.Gender" ,
-216 "LocationCountries.Country" ,
-217 "DetailedDescription.Textblock" ,
-218 "BriefSummary.Textblock" ,
-219 "ConditionBrowse.MeshTerm" ,
-220 "Eligibility.Criteria.Textblock" ,
-221 "InterventionBrowse.MeshTerm" ,
-222 "StudyType" ,
-223 "IsFdaRegulated" ,
-224 "HasExpandedAccess" ,
-225 "RequiredHeader.LinkText" ,
-226 "BiospecRetention" ,
-227 "OfficialTitle" ,
-228 "Eligibility.SamplingMethod" ,
-229 "Eligibility.StudyPop.Textblock" ,
-230 "Condition" ,
-231 "Eligibility.MinimumAge" ,
-232 "Keyword" ,
-233 "Eligibility.MaximumAge" ,
-234 "BriefTitle" ,
-235 ]
-236 self . best_embed_fields = [
-237 "WhyStopped" ,
-238 "HasExpandedAccess" ,
-239 "BiospecRetention" ,
-240 "BriefSummary.Textblock" ,
-241 "LocationCountries.Country" ,
-242 "ConditionBrowse.MeshTerm" ,
-243 "DetailedDescription.Textblock" ,
-244 "RequiredHeader.LinkText" ,
-245 "Eligibility.Criteria.Textblock" ,
-246 ]
-247
-248 self . sensible = [
-249 "BriefSummary.Textblock" "BriefTitle" ,
-250 "Eligibility.StudyPop.Textblock" ,
-251 "DetailedDescription.Textblock" ,
-252 "Eligibility.MinimumAge" ,
-253 "Eligibility.Criteria.Textblock" ,
-254 "InterventionBrowse.Text" ,
-255 "Eligibility.Text" ,
-256 "BiospecDescr.Textblock" ,
-257 "Eligibility.MaximumAge" ,
-258 "DetailedDescription.Text" ,
-259 "ConditionBrowse.MeshTerm" ,
-260 "ConditionBrowse.Text" ,
-261 "Eligibility.StudyPop.Text" ,
-262 "InterventionBrowse.MeshTerm" ,
-263 "OfficialTitle" ,
-264 "Condition" ,
-265 "PrimaryOutcome" ,
-266 "BiospecDescr.Text" ,
-267 "Eligibility.Gender" ,
-268 "Keyword" ,
-269 "BiospecRetention" ,
-270 "Eligibility.Criteria.Text" ,
-271 "BriefSummary.Text" ,
-272 ]
-273
-274 self . sensible_embed = [
-275 "BriefSummary.Textblock" "BriefTitle" ,
-276 "Eligibility.StudyPop.Textblock" ,
-277 "DetailedDescription.Textblock" ,
-278 "Eligibility.Criteria.Textblock" ,
-279 "InterventionBrowse.Text" ,
-280 "Eligibility.Text" ,
-281 "BiospecDescr.Textblock" ,
-282 "DetailedDescription.Text" ,
-283 "ConditionBrowse.MeshTerm" ,
-284 "ConditionBrowse.Text" ,
-285 "Eligibility.StudyPop.Text" ,
-286 "InterventionBrowse.MeshTerm" ,
-287 "OfficialTitle" ,
-288 "Condition" ,
-289 "PrimaryOutcome" ,
-290 "BiospecDescr.Text" ,
-291 "Keyword" ,
-292 "BiospecRetention" ,
-293 "Eligibility.Criteria.Text" ,
-294 "BriefSummary.Text" ,
-295 ]
-296
-297 self . sensible_embed_safe = list (
-298 set ( self . best_recall_fields ) . intersection ( set ( self . sensible_embed ))
-299 )
-300
-301 self . query_funcs = {
-302 "query" : self . generate_query ,
-303 "ablation" : self . generate_query_ablation ,
-304 "embedding" : self . generate_query_embedding ,
-305 }
-306
-307 loguru . logger . debug ( self . sensible_embed_safe )
-308
-309 self . field_usage = {
-310 "best_recall_fields" : self . best_recall_fields ,
-311 "all" : self . mappings ,
-312 "best_map_fields" : self . best_map_fields ,
-313 "best_embed_fields" : self . best_embed_fields ,
-314 "sensible" : self . sensible ,
-315 "sensible_embed" : self . sensible_embed ,
-316 "sensible_embed_safe" : self . sensible_embed_safe ,
-317 }
-318
-319 @apply_config
-320 def generate_query ( self , topic_num , query_field_usage , ** kwargs ) -> Dict :
-321 """
-322 Generates a query for the clinical trials index
-323
-324 :param topic_num: Topic number to search
-325 :param query_field_usage: Which document facets to search over
-326 :param kwargs:
-327 :return:
-328 A basic elasticsearch query for clinical trials
-329 """
-330 fields = self . field_usage [ query_field_usage ]
-331 should = { "should" : []}
-332
-333 qfield = list ( self . topics [ topic_num ] . keys ())[ 0 ]
-334 query = self . topics [ topic_num ][ qfield ]
-335
-336 for i , field in enumerate ( fields ):
-337 should [ "should" ] . append (
-338 {
-339 "match" : {
-340 f " { field } " : {
-341 "query" : query ,
-342 }
-343 }
-344 }
-345 )
-346
-347 query = {
-348 "query" : {
-349 "bool" : should ,
-350 }
-351 }
-352
-353 return query
-354
-355 def generate_query_ablation ( self , topic_num , ** kwargs ):
-356 """
-357 Only search one document facet at a time
-358 :param topic_num:
-359 :param kwargs:
-360 :return:
-361 """
-362 query = { "query" : { "match" : {}}}
-363
-364 for field in self . fields :
-365 query [ "query" ][ "match" ][ self . mappings [ field ]] = ""
-366
-367 for qfield in self . fields :
-368 qfield = self . mappings [ qfield ]
-369 for field in self . topics [ topic_num ]:
-370 query [ "query" ][ "match" ][ qfield ] += self . topics [ topic_num ][ field ]
-371
-372 return query
-373
-374 @apply_config
-375 def generate_query_embedding (
-376 self ,
-377 topic_num ,
-378 encoder ,
-379 query_field_usage ,
-380 embed_field_usage ,
-381 cosine_weights : List [ float ] = None ,
-382 query_weight : List [ float ] = None ,
-383 norm_weight = 2.15 ,
-384 ablations = False ,
-385 automatic_scores = None ,
-386 ** kwargs ,
-387 ):
-388 """
-389 Computes the NIR score for a given topic
-390
-391 Score = log(BM25)/log(norm_weight) + embedding_score
-392
-393 :param topic_num:
-394 :param encoder:
-395 :param query_field_usage:
-396 :param embed_field_usage:
-397 :param cosine_weights:
-398 :param query_weight:
-399 :param norm_weight:
-400 :param ablations:
-401 :param automatic_scores:
-402 :param kwargs:
-403 :return:
-404 """
-405 should = { "should" : []}
-406
-407 assert norm_weight or automatic_scores
-408
-409 query_fields = self . field_usage [ query_field_usage ]
-410 embed_fields = self . field_usage [ embed_field_usage ]
-411
-412 qfield = list ( self . topics [ topic_num ] . keys ())[ 0 ]
-413 query = self . topics [ topic_num ][ qfield ]
-414
-415 for i , field in enumerate ( query_fields ):
-416 should [ "should" ] . append (
-417 {
-418 "match" : {
-419 f " { field } " : {
-420 "query" : query ,
-421 "boost" : query_weight [ i ] if query_weight else 1 ,
-422 }
-423 }
-424 }
-425 )
-426
-427 if automatic_scores is not None :
-428 norm_weight = get_z_value (
-429 cosine_ceiling = len ( embed_fields ) * len ( query_fields ),
-430 bm25_ceiling = automatic_scores [ topic_num ],
-431 )
-432
-433 params = {
-434 "weights" : cosine_weights if cosine_weights else [ 1 ] * len ( embed_fields ),
-435 "q_eb" : encoder . encode ( self . topics [ topic_num ][ qfield ]),
-436 "offset" : 1.0 ,
-437 "norm_weight" : norm_weight ,
-438 "disable_bm25" : ablations ,
-439 }
-440
-441 query = {
-442 "query" : {
-443 "script_score" : {
-444 "query" : {
-445 "bool" : should ,
-446 },
-447 "script" : generate_script ( self . best_embed_fields , params = params ),
-448 },
-449 }
-450 }
-451
-452 return query
-453
-454 def get_query_type ( self , * args , ** kwargs ):
-455 return self . query_funcs [ self . query_type ]( * args , ** kwargs )
-456
-457 def get_id_mapping ( self , hit ):
-458 return hit [ self . id_mapping ]
-459
-460
-461 class ClinicalTrialsElasticsearchExecutor ( GenericElasticsearchExecutor ):
-462 """
-463 Executes queries given a query object.
-464 """
-465 query : TrialsElasticsearchQuery
-466
-467 def __init__ (
-468 self ,
-469 topics : Dict [ Union [ str , int ], Dict [ str , str ]],
-470 client : Elasticsearch ,
-471 index_name : str ,
-472 output_file : str ,
-473 query : TrialsElasticsearchQuery ,
-474 encoder : Optional [ Encoder ] = None ,
-475 config = None ,
-476 * args ,
-477 ** kwargs ,
-478 ):
-479
-480 super () . __init__ (
-481 topics ,
-482 client ,
-483 index_name ,
-484 output_file ,
-485 query ,
-486 encoder ,
-487 config = config ,
-488 * args ,
-489 ** kwargs ,
-490 )
-491
-492 self . query_fns = {
-493 "query" : self . generate_query ,
-494 "ablation" : self . generate_query_ablation ,
-495 "embedding" : self . generate_embedding_query ,
-496 }
-497
-498
-499 class ClinicalTrialParser ( Parser ):
-500 """
-501 Parser for Clinical Trials topics
-502 """
-503 @classmethod
-504 def get_topics ( cls , csvfile ) -> Dict [ int , Dict [ str , str ]]:
-505 topics = {}
-506 reader = csv . reader ( csvfile )
-507 for i , row in enumerate ( reader ):
-508 if i == 0 :
-509 continue
-510
-511 _id = row [ 0 ]
-512 text = row [ 1 ]
-513
-514 topics [ _id ] = { "text" : text }
-515
-516 return topics
-
-
-
-
-
-
-
-
- 19 @dataclass ( init = True , unsafe_hash = True )
-20 class TrialsQueryConfig ( GenericConfig ):
-21 query_field_usage : str = None
-22 embed_field_usage : str = None
-23 fields : List [ str ] = None
-24
-25 def validate ( self ):
-26 """
-27 Checks if query type is included, and checks if an encoder is included for embedding queries
-28 """
-29 if self . query_type == "embedding" :
-30 assert self . query_field_usage and self . embed_field_usage , (
-31 "Must have both field usages" " if embedding query"
-32 )
-33 assert (
-34 self . encoder_fp and self . encoder
-35 ), "Must provide encoder path for embedding model"
-36 assert self . norm_weight is not None or self . automatic is not None , (
-37 "Norm weight be specified or be " "automatic "
-38 )
-39
-40 assert (
-41 self . query_field_usage is not None or self . fields is not None
-42 ), "Must have a query field"
-43 assert self . query_type in [
-44 "ablation" ,
-45 "query" ,
-46 "query_best" ,
-47 "embedding" ,
-48 ], "Check your query type"
-49
-50 @classmethod
-51 def from_toml ( cls , fp : str , * args , ** kwargs ) -> "GenericConfig" :
-52 return super () . from_toml ( fp , cls , * args , ** kwargs )
-53
-54 @classmethod
-55 def from_dict ( cls , ** kwargs ) -> "GenericConfig" :
-56 return super () . from_dict ( cls , ** kwargs )
-
-
-
-
-
-
-
-
- TrialsQueryConfig ( query_type : str , index : str = None , encoder_normalize : bool = True , ablations : bool = False , norm_weight : float = None , automatic : bool = None , encoder : object = None , encoder_fp : str = None , query_weights : List [ float ] = None , cosine_weights : List [ float ] = None , evaluate : bool = False , qrels : str = None , config_fn : str = None , query_fn : str = None , parser_fn : str = None , executor_fn : str = None , cosine_ceiling : float = None , topics_path : str = None , return_id_only : bool = False , overwrite_output_if_exists : bool = False , output_file : str = None , run_name : str = None , query_field_usage : str = None , embed_field_usage : str = None , fields : List [ str ] = None )
-
-
-
-
-
-
-
-
-
-
-
-
- def
- validate (self ):
-
- View Source
-
-
-
-
25 def validate ( self ):
-26 """
-27 Checks if query type is included, and checks if an encoder is included for embedding queries
-28 """
-29 if self . query_type == "embedding" :
-30 assert self . query_field_usage and self . embed_field_usage , (
-31 "Must have both field usages" " if embedding query"
-32 )
-33 assert (
-34 self . encoder_fp and self . encoder
-35 ), "Must provide encoder path for embedding model"
-36 assert self . norm_weight is not None or self . automatic is not None , (
-37 "Norm weight be specified or be " "automatic "
-38 )
-39
-40 assert (
-41 self . query_field_usage is not None or self . fields is not None
-42 ), "Must have a query field"
-43 assert self . query_type in [
-44 "ablation" ,
-45 "query" ,
-46 "query_best" ,
-47 "embedding" ,
-48 ], "Check your query type"
-
-
-
-
Checks if query type is included, and checks if an encoder is included for embedding queries
-
-
-
-
-
-
-
-
-
50 @classmethod
-51 def from_toml ( cls , fp : str , * args , ** kwargs ) -> "GenericConfig" :
-52 return super () . from_toml ( fp , cls , * args , ** kwargs )
-
-
-
-
Instantiates a Config object from a toml file
-
-
Parameters
-
-
-fp : File path of the Config TOML file
-field_class : Class of the Config object to be instantiated
-args : Arguments to be passed to Config
-kwargs : Keyword arguments to be passed
-
-
-
Returns
-
-
-A instantiated and validated Config object.
-
-
-
-
-
-
-
-
-
-
-
54 @classmethod
-55 def from_dict ( cls , ** kwargs ) -> "GenericConfig" :
-56 return super () . from_dict ( cls , ** kwargs )
-
-
-
-
Instantiates a Config object from a dictionary
-
-
Parameters
-
-
-
-
Returns
-
-
-
-
-
-
Inherited Members
-
-
-
-
-
-
-
-
-
- 59 class TrialsElasticsearchQuery ( GenericElasticsearchQuery ):
- 60 """
- 61 Elasticsearch Query object for the Clinical Trials Index
- 62 """
- 63 topics : Dict [ int , Dict [ str , str ]]
- 64 query_type : str
- 65 fields : List [ int ]
- 66 query_funcs : Dict
- 67 config : GenericConfig
- 68 id_mapping : str = "_id"
- 69 mappings : List [ str ]
- 70 config : TrialsQueryConfig
- 71
- 72 def __init__ ( self , topics , query_type , config = None , * args , ** kwargs ):
- 73 super () . __init__ ( topics , config , * args , ** kwargs )
- 74 self . query_type = query_type
- 75 self . config = config
- 76 self . topics = topics
- 77 self . fields = []
- 78 self . mappings = [
- 79 "HasExpandedAccess" ,
- 80 "BriefSummary.Textblock" ,
- 81 "CompletionDate.Type" ,
- 82 "OversightInfo.Text" ,
- 83 "OverallContactBackup.PhoneExt" ,
- 84 "RemovedCountries.Text" ,
- 85 "SecondaryOutcome" ,
- 86 "Sponsors.LeadSponsor.Text" ,
- 87 "BriefTitle" ,
- 88 "IDInfo.NctID" ,
- 89 "IDInfo.SecondaryID" ,
- 90 "OverallContactBackup.Phone" ,
- 91 "Eligibility.StudyPop.Textblock" ,
- 92 "DetailedDescription.Textblock" ,
- 93 "Eligibility.MinimumAge" ,
- 94 "Sponsors.Collaborator" ,
- 95 "Reference" ,
- 96 "Eligibility.Criteria.Textblock" ,
- 97 "XMLName.Space" ,
- 98 "Rank" ,
- 99 "OverallStatus" ,
-100 "InterventionBrowse.Text" ,
-101 "Eligibility.Text" ,
-102 "Intervention" ,
-103 "BiospecDescr.Textblock" ,
-104 "ResponsibleParty.NameTitle" ,
-105 "NumberOfArms" ,
-106 "ResponsibleParty.ResponsiblePartyType" ,
-107 "IsSection801" ,
-108 "Acronym" ,
-109 "Eligibility.MaximumAge" ,
-110 "DetailedDescription.Text" ,
-111 "StudyDesign" ,
-112 "OtherOutcome" ,
-113 "VerificationDate" ,
-114 "ConditionBrowse.MeshTerm" ,
-115 "Enrollment.Text" ,
-116 "IDInfo.Text" ,
-117 "ConditionBrowse.Text" ,
-118 "FirstreceivedDate" ,
-119 "NumberOfGroups" ,
-120 "OversightInfo.HasDmc" ,
-121 "PrimaryCompletionDate.Text" ,
-122 "ResultsReference" ,
-123 "Eligibility.StudyPop.Text" ,
-124 "IsFdaRegulated" ,
-125 "WhyStopped" ,
-126 "ArmGroup" ,
-127 "OverallContact.LastName" ,
-128 "Phase" ,
-129 "RemovedCountries.Country" ,
-130 "InterventionBrowse.MeshTerm" ,
-131 "Eligibility.HealthyVolunteers" ,
-132 "Location" ,
-133 "OfficialTitle" ,
-134 "OverallContact.Email" ,
-135 "RequiredHeader.Text" ,
-136 "RequiredHeader.URL" ,
-137 "LocationCountries.Country" ,
-138 "OverallContact.PhoneExt" ,
-139 "Condition" ,
-140 "PrimaryOutcome" ,
-141 "LocationCountries.Text" ,
-142 "BiospecDescr.Text" ,
-143 "IDInfo.OrgStudyID" ,
-144 "Link" ,
-145 "OverallContact.Phone" ,
-146 "Source" ,
-147 "ResponsibleParty.InvestigatorAffiliation" ,
-148 "StudyType" ,
-149 "FirstreceivedResultsDate" ,
-150 "Enrollment.Type" ,
-151 "Eligibility.Gender" ,
-152 "OverallContactBackup.LastName" ,
-153 "Keyword" ,
-154 "BiospecRetention" ,
-155 "CompletionDate.Text" ,
-156 "OverallContact.Text" ,
-157 "RequiredHeader.DownloadDate" ,
-158 "Sponsors.Text" ,
-159 "Text" ,
-160 "Eligibility.SamplingMethod" ,
-161 "LastchangedDate" ,
-162 "ResponsibleParty.InvestigatorFullName" ,
-163 "StartDate" ,
-164 "RequiredHeader.LinkText" ,
-165 "OverallOfficial" ,
-166 "Sponsors.LeadSponsor.AgencyClass" ,
-167 "OverallContactBackup.Text" ,
-168 "Eligibility.Criteria.Text" ,
-169 "XMLName.Local" ,
-170 "OversightInfo.Authority" ,
-171 "PrimaryCompletionDate.Type" ,
-172 "ResponsibleParty.Organization" ,
-173 "IDInfo.NctAlias" ,
-174 "ResponsibleParty.Text" ,
-175 "TargetDuration" ,
-176 "Sponsors.LeadSponsor.Agency" ,
-177 "BriefSummary.Text" ,
-178 "OverallContactBackup.Email" ,
-179 "ResponsibleParty.InvestigatorTitle" ,
-180 ]
-181
-182 self . best_recall_fields = [
-183 "LocationCountries.Country" ,
-184 "BiospecRetention" ,
-185 "DetailedDescription.Textblock" ,
-186 "HasExpandedAccess" ,
-187 "ConditionBrowse.MeshTerm" ,
-188 "RequiredHeader.LinkText" ,
-189 "WhyStopped" ,
-190 "BriefSummary.Textblock" ,
-191 "Eligibility.Criteria.Textblock" ,
-192 "OfficialTitle" ,
-193 "Eligibility.MaximumAge" ,
-194 "Eligibility.StudyPop.Textblock" ,
-195 "BiospecDescr.Textblock" ,
-196 "BriefTitle" ,
-197 "Eligibility.MinimumAge" ,
-198 "ResponsibleParty.Organization" ,
-199 "TargetDuration" ,
-200 "Condition" ,
-201 "IDInfo.OrgStudyID" ,
-202 "Keyword" ,
-203 "Source" ,
-204 "Sponsors.LeadSponsor.Agency" ,
-205 "ResponsibleParty.InvestigatorAffiliation" ,
-206 "OversightInfo.Authority" ,
-207 "OversightInfo.HasDmc" ,
-208 "OverallContact.Phone" ,
-209 "Phase" ,
-210 "OverallContactBackup.LastName" ,
-211 "Acronym" ,
-212 "InterventionBrowse.MeshTerm" ,
-213 "RemovedCountries.Country" ,
-214 ]
-215 self . best_map_fields = [
-216 "Eligibility.Gender" ,
-217 "LocationCountries.Country" ,
-218 "DetailedDescription.Textblock" ,
-219 "BriefSummary.Textblock" ,
-220 "ConditionBrowse.MeshTerm" ,
-221 "Eligibility.Criteria.Textblock" ,
-222 "InterventionBrowse.MeshTerm" ,
-223 "StudyType" ,
-224 "IsFdaRegulated" ,
-225 "HasExpandedAccess" ,
-226 "RequiredHeader.LinkText" ,
-227 "BiospecRetention" ,
-228 "OfficialTitle" ,
-229 "Eligibility.SamplingMethod" ,
-230 "Eligibility.StudyPop.Textblock" ,
-231 "Condition" ,
-232 "Eligibility.MinimumAge" ,
-233 "Keyword" ,
-234 "Eligibility.MaximumAge" ,
-235 "BriefTitle" ,
-236 ]
-237 self . best_embed_fields = [
-238 "WhyStopped" ,
-239 "HasExpandedAccess" ,
-240 "BiospecRetention" ,
-241 "BriefSummary.Textblock" ,
-242 "LocationCountries.Country" ,
-243 "ConditionBrowse.MeshTerm" ,
-244 "DetailedDescription.Textblock" ,
-245 "RequiredHeader.LinkText" ,
-246 "Eligibility.Criteria.Textblock" ,
-247 ]
-248
-249 self . sensible = [
-250 "BriefSummary.Textblock" "BriefTitle" ,
-251 "Eligibility.StudyPop.Textblock" ,
-252 "DetailedDescription.Textblock" ,
-253 "Eligibility.MinimumAge" ,
-254 "Eligibility.Criteria.Textblock" ,
-255 "InterventionBrowse.Text" ,
-256 "Eligibility.Text" ,
-257 "BiospecDescr.Textblock" ,
-258 "Eligibility.MaximumAge" ,
-259 "DetailedDescription.Text" ,
-260 "ConditionBrowse.MeshTerm" ,
-261 "ConditionBrowse.Text" ,
-262 "Eligibility.StudyPop.Text" ,
-263 "InterventionBrowse.MeshTerm" ,
-264 "OfficialTitle" ,
-265 "Condition" ,
-266 "PrimaryOutcome" ,
-267 "BiospecDescr.Text" ,
-268 "Eligibility.Gender" ,
-269 "Keyword" ,
-270 "BiospecRetention" ,
-271 "Eligibility.Criteria.Text" ,
-272 "BriefSummary.Text" ,
-273 ]
-274
-275 self . sensible_embed = [
-276 "BriefSummary.Textblock" "BriefTitle" ,
-277 "Eligibility.StudyPop.Textblock" ,
-278 "DetailedDescription.Textblock" ,
-279 "Eligibility.Criteria.Textblock" ,
-280 "InterventionBrowse.Text" ,
-281 "Eligibility.Text" ,
-282 "BiospecDescr.Textblock" ,
-283 "DetailedDescription.Text" ,
-284 "ConditionBrowse.MeshTerm" ,
-285 "ConditionBrowse.Text" ,
-286 "Eligibility.StudyPop.Text" ,
-287 "InterventionBrowse.MeshTerm" ,
-288 "OfficialTitle" ,
-289 "Condition" ,
-290 "PrimaryOutcome" ,
-291 "BiospecDescr.Text" ,
-292 "Keyword" ,
-293 "BiospecRetention" ,
-294 "Eligibility.Criteria.Text" ,
-295 "BriefSummary.Text" ,
-296 ]
-297
-298 self . sensible_embed_safe = list (
-299 set ( self . best_recall_fields ) . intersection ( set ( self . sensible_embed ))
-300 )
-301
-302 self . query_funcs = {
-303 "query" : self . generate_query ,
-304 "ablation" : self . generate_query_ablation ,
-305 "embedding" : self . generate_query_embedding ,
-306 }
-307
-308 loguru . logger . debug ( self . sensible_embed_safe )
-309
-310 self . field_usage = {
-311 "best_recall_fields" : self . best_recall_fields ,
-312 "all" : self . mappings ,
-313 "best_map_fields" : self . best_map_fields ,
-314 "best_embed_fields" : self . best_embed_fields ,
-315 "sensible" : self . sensible ,
-316 "sensible_embed" : self . sensible_embed ,
-317 "sensible_embed_safe" : self . sensible_embed_safe ,
-318 }
-319
-320 @apply_config
-321 def generate_query ( self , topic_num , query_field_usage , ** kwargs ) -> Dict :
-322 """
-323 Generates a query for the clinical trials index
-324
-325 :param topic_num: Topic number to search
-326 :param query_field_usage: Which document facets to search over
-327 :param kwargs:
-328 :return:
-329 A basic elasticsearch query for clinical trials
-330 """
-331 fields = self . field_usage [ query_field_usage ]
-332 should = { "should" : []}
-333
-334 qfield = list ( self . topics [ topic_num ] . keys ())[ 0 ]
-335 query = self . topics [ topic_num ][ qfield ]
-336
-337 for i , field in enumerate ( fields ):
-338 should [ "should" ] . append (
-339 {
-340 "match" : {
-341 f " { field } " : {
-342 "query" : query ,
-343 }
-344 }
-345 }
-346 )
-347
-348 query = {
-349 "query" : {
-350 "bool" : should ,
-351 }
-352 }
-353
-354 return query
-355
-356 def generate_query_ablation ( self , topic_num , ** kwargs ):
-357 """
-358 Only search one document facet at a time
-359 :param topic_num:
-360 :param kwargs:
-361 :return:
-362 """
-363 query = { "query" : { "match" : {}}}
-364
-365 for field in self . fields :
-366 query [ "query" ][ "match" ][ self . mappings [ field ]] = ""
-367
-368 for qfield in self . fields :
-369 qfield = self . mappings [ qfield ]
-370 for field in self . topics [ topic_num ]:
-371 query [ "query" ][ "match" ][ qfield ] += self . topics [ topic_num ][ field ]
-372
-373 return query
-374
-375 @apply_config
-376 def generate_query_embedding (
-377 self ,
-378 topic_num ,
-379 encoder ,
-380 query_field_usage ,
-381 embed_field_usage ,
-382 cosine_weights : List [ float ] = None ,
-383 query_weight : List [ float ] = None ,
-384 norm_weight = 2.15 ,
-385 ablations = False ,
-386 automatic_scores = None ,
-387 ** kwargs ,
-388 ):
-389 """
-390 Computes the NIR score for a given topic
-391
-392 Score = log(BM25)/log(norm_weight) + embedding_score
-393
-394 :param topic_num:
-395 :param encoder:
-396 :param query_field_usage:
-397 :param embed_field_usage:
-398 :param cosine_weights:
-399 :param query_weight:
-400 :param norm_weight:
-401 :param ablations:
-402 :param automatic_scores:
-403 :param kwargs:
-404 :return:
-405 """
-406 should = { "should" : []}
-407
-408 assert norm_weight or automatic_scores
-409
-410 query_fields = self . field_usage [ query_field_usage ]
-411 embed_fields = self . field_usage [ embed_field_usage ]
-412
-413 qfield = list ( self . topics [ topic_num ] . keys ())[ 0 ]
-414 query = self . topics [ topic_num ][ qfield ]
-415
-416 for i , field in enumerate ( query_fields ):
-417 should [ "should" ] . append (
-418 {
-419 "match" : {
-420 f " { field } " : {
-421 "query" : query ,
-422 "boost" : query_weight [ i ] if query_weight else 1 ,
-423 }
-424 }
-425 }
-426 )
-427
-428 if automatic_scores is not None :
-429 norm_weight = get_z_value (
-430 cosine_ceiling = len ( embed_fields ) * len ( query_fields ),
-431 bm25_ceiling = automatic_scores [ topic_num ],
-432 )
-433
-434 params = {
-435 "weights" : cosine_weights if cosine_weights else [ 1 ] * len ( embed_fields ),
-436 "q_eb" : encoder . encode ( self . topics [ topic_num ][ qfield ]),
-437 "offset" : 1.0 ,
-438 "norm_weight" : norm_weight ,
-439 "disable_bm25" : ablations ,
-440 }
-441
-442 query = {
-443 "query" : {
-444 "script_score" : {
-445 "query" : {
-446 "bool" : should ,
-447 },
-448 "script" : generate_script ( self . best_embed_fields , params = params ),
-449 },
-450 }
-451 }
-452
-453 return query
-454
-455 def get_query_type ( self , * args , ** kwargs ):
-456 return self . query_funcs [ self . query_type ]( * args , ** kwargs )
-457
-458 def get_id_mapping ( self , hit ):
-459 return hit [ self . id_mapping ]
-
-
-
- Elasticsearch Query object for the Clinical Trials Index
-
-
-
-
-
-
-
- TrialsElasticsearchQuery (topics , query_type , config = None , * args , ** kwargs )
-
- View Source
-
-
-
-
72 def __init__ ( self , topics , query_type , config = None , * args , ** kwargs ):
- 73 super () . __init__ ( topics , config , * args , ** kwargs )
- 74 self . query_type = query_type
- 75 self . config = config
- 76 self . topics = topics
- 77 self . fields = []
- 78 self . mappings = [
- 79 "HasExpandedAccess" ,
- 80 "BriefSummary.Textblock" ,
- 81 "CompletionDate.Type" ,
- 82 "OversightInfo.Text" ,
- 83 "OverallContactBackup.PhoneExt" ,
- 84 "RemovedCountries.Text" ,
- 85 "SecondaryOutcome" ,
- 86 "Sponsors.LeadSponsor.Text" ,
- 87 "BriefTitle" ,
- 88 "IDInfo.NctID" ,
- 89 "IDInfo.SecondaryID" ,
- 90 "OverallContactBackup.Phone" ,
- 91 "Eligibility.StudyPop.Textblock" ,
- 92 "DetailedDescription.Textblock" ,
- 93 "Eligibility.MinimumAge" ,
- 94 "Sponsors.Collaborator" ,
- 95 "Reference" ,
- 96 "Eligibility.Criteria.Textblock" ,
- 97 "XMLName.Space" ,
- 98 "Rank" ,
- 99 "OverallStatus" ,
-100 "InterventionBrowse.Text" ,
-101 "Eligibility.Text" ,
-102 "Intervention" ,
-103 "BiospecDescr.Textblock" ,
-104 "ResponsibleParty.NameTitle" ,
-105 "NumberOfArms" ,
-106 "ResponsibleParty.ResponsiblePartyType" ,
-107 "IsSection801" ,
-108 "Acronym" ,
-109 "Eligibility.MaximumAge" ,
-110 "DetailedDescription.Text" ,
-111 "StudyDesign" ,
-112 "OtherOutcome" ,
-113 "VerificationDate" ,
-114 "ConditionBrowse.MeshTerm" ,
-115 "Enrollment.Text" ,
-116 "IDInfo.Text" ,
-117 "ConditionBrowse.Text" ,
-118 "FirstreceivedDate" ,
-119 "NumberOfGroups" ,
-120 "OversightInfo.HasDmc" ,
-121 "PrimaryCompletionDate.Text" ,
-122 "ResultsReference" ,
-123 "Eligibility.StudyPop.Text" ,
-124 "IsFdaRegulated" ,
-125 "WhyStopped" ,
-126 "ArmGroup" ,
-127 "OverallContact.LastName" ,
-128 "Phase" ,
-129 "RemovedCountries.Country" ,
-130 "InterventionBrowse.MeshTerm" ,
-131 "Eligibility.HealthyVolunteers" ,
-132 "Location" ,
-133 "OfficialTitle" ,
-134 "OverallContact.Email" ,
-135 "RequiredHeader.Text" ,
-136 "RequiredHeader.URL" ,
-137 "LocationCountries.Country" ,
-138 "OverallContact.PhoneExt" ,
-139 "Condition" ,
-140 "PrimaryOutcome" ,
-141 "LocationCountries.Text" ,
-142 "BiospecDescr.Text" ,
-143 "IDInfo.OrgStudyID" ,
-144 "Link" ,
-145 "OverallContact.Phone" ,
-146 "Source" ,
-147 "ResponsibleParty.InvestigatorAffiliation" ,
-148 "StudyType" ,
-149 "FirstreceivedResultsDate" ,
-150 "Enrollment.Type" ,
-151 "Eligibility.Gender" ,
-152 "OverallContactBackup.LastName" ,
-153 "Keyword" ,
-154 "BiospecRetention" ,
-155 "CompletionDate.Text" ,
-156 "OverallContact.Text" ,
-157 "RequiredHeader.DownloadDate" ,
-158 "Sponsors.Text" ,
-159 "Text" ,
-160 "Eligibility.SamplingMethod" ,
-161 "LastchangedDate" ,
-162 "ResponsibleParty.InvestigatorFullName" ,
-163 "StartDate" ,
-164 "RequiredHeader.LinkText" ,
-165 "OverallOfficial" ,
-166 "Sponsors.LeadSponsor.AgencyClass" ,
-167 "OverallContactBackup.Text" ,
-168 "Eligibility.Criteria.Text" ,
-169 "XMLName.Local" ,
-170 "OversightInfo.Authority" ,
-171 "PrimaryCompletionDate.Type" ,
-172 "ResponsibleParty.Organization" ,
-173 "IDInfo.NctAlias" ,
-174 "ResponsibleParty.Text" ,
-175 "TargetDuration" ,
-176 "Sponsors.LeadSponsor.Agency" ,
-177 "BriefSummary.Text" ,
-178 "OverallContactBackup.Email" ,
-179 "ResponsibleParty.InvestigatorTitle" ,
-180 ]
-181
-182 self . best_recall_fields = [
-183 "LocationCountries.Country" ,
-184 "BiospecRetention" ,
-185 "DetailedDescription.Textblock" ,
-186 "HasExpandedAccess" ,
-187 "ConditionBrowse.MeshTerm" ,
-188 "RequiredHeader.LinkText" ,
-189 "WhyStopped" ,
-190 "BriefSummary.Textblock" ,
-191 "Eligibility.Criteria.Textblock" ,
-192 "OfficialTitle" ,
-193 "Eligibility.MaximumAge" ,
-194 "Eligibility.StudyPop.Textblock" ,
-195 "BiospecDescr.Textblock" ,
-196 "BriefTitle" ,
-197 "Eligibility.MinimumAge" ,
-198 "ResponsibleParty.Organization" ,
-199 "TargetDuration" ,
-200 "Condition" ,
-201 "IDInfo.OrgStudyID" ,
-202 "Keyword" ,
-203 "Source" ,
-204 "Sponsors.LeadSponsor.Agency" ,
-205 "ResponsibleParty.InvestigatorAffiliation" ,
-206 "OversightInfo.Authority" ,
-207 "OversightInfo.HasDmc" ,
-208 "OverallContact.Phone" ,
-209 "Phase" ,
-210 "OverallContactBackup.LastName" ,
-211 "Acronym" ,
-212 "InterventionBrowse.MeshTerm" ,
-213 "RemovedCountries.Country" ,
-214 ]
-215 self . best_map_fields = [
-216 "Eligibility.Gender" ,
-217 "LocationCountries.Country" ,
-218 "DetailedDescription.Textblock" ,
-219 "BriefSummary.Textblock" ,
-220 "ConditionBrowse.MeshTerm" ,
-221 "Eligibility.Criteria.Textblock" ,
-222 "InterventionBrowse.MeshTerm" ,
-223 "StudyType" ,
-224 "IsFdaRegulated" ,
-225 "HasExpandedAccess" ,
-226 "RequiredHeader.LinkText" ,
-227 "BiospecRetention" ,
-228 "OfficialTitle" ,
-229 "Eligibility.SamplingMethod" ,
-230 "Eligibility.StudyPop.Textblock" ,
-231 "Condition" ,
-232 "Eligibility.MinimumAge" ,
-233 "Keyword" ,
-234 "Eligibility.MaximumAge" ,
-235 "BriefTitle" ,
-236 ]
-237 self . best_embed_fields = [
-238 "WhyStopped" ,
-239 "HasExpandedAccess" ,
-240 "BiospecRetention" ,
-241 "BriefSummary.Textblock" ,
-242 "LocationCountries.Country" ,
-243 "ConditionBrowse.MeshTerm" ,
-244 "DetailedDescription.Textblock" ,
-245 "RequiredHeader.LinkText" ,
-246 "Eligibility.Criteria.Textblock" ,
-247 ]
-248
-249 self . sensible = [
-250 "BriefSummary.Textblock" "BriefTitle" ,
-251 "Eligibility.StudyPop.Textblock" ,
-252 "DetailedDescription.Textblock" ,
-253 "Eligibility.MinimumAge" ,
-254 "Eligibility.Criteria.Textblock" ,
-255 "InterventionBrowse.Text" ,
-256 "Eligibility.Text" ,
-257 "BiospecDescr.Textblock" ,
-258 "Eligibility.MaximumAge" ,
-259 "DetailedDescription.Text" ,
-260 "ConditionBrowse.MeshTerm" ,
-261 "ConditionBrowse.Text" ,
-262 "Eligibility.StudyPop.Text" ,
-263 "InterventionBrowse.MeshTerm" ,
-264 "OfficialTitle" ,
-265 "Condition" ,
-266 "PrimaryOutcome" ,
-267 "BiospecDescr.Text" ,
-268 "Eligibility.Gender" ,
-269 "Keyword" ,
-270 "BiospecRetention" ,
-271 "Eligibility.Criteria.Text" ,
-272 "BriefSummary.Text" ,
-273 ]
-274
-275 self . sensible_embed = [
-276 "BriefSummary.Textblock" "BriefTitle" ,
-277 "Eligibility.StudyPop.Textblock" ,
-278 "DetailedDescription.Textblock" ,
-279 "Eligibility.Criteria.Textblock" ,
-280 "InterventionBrowse.Text" ,
-281 "Eligibility.Text" ,
-282 "BiospecDescr.Textblock" ,
-283 "DetailedDescription.Text" ,
-284 "ConditionBrowse.MeshTerm" ,
-285 "ConditionBrowse.Text" ,
-286 "Eligibility.StudyPop.Text" ,
-287 "InterventionBrowse.MeshTerm" ,
-288 "OfficialTitle" ,
-289 "Condition" ,
-290 "PrimaryOutcome" ,
-291 "BiospecDescr.Text" ,
-292 "Keyword" ,
-293 "BiospecRetention" ,
-294 "Eligibility.Criteria.Text" ,
-295 "BriefSummary.Text" ,
-296 ]
-297
-298 self . sensible_embed_safe = list (
-299 set ( self . best_recall_fields ) . intersection ( set ( self . sensible_embed ))
-300 )
-301
-302 self . query_funcs = {
-303 "query" : self . generate_query ,
-304 "ablation" : self . generate_query_ablation ,
-305 "embedding" : self . generate_query_embedding ,
-306 }
-307
-308 loguru . logger . debug ( self . sensible_embed_safe )
-309
-310 self . field_usage = {
-311 "best_recall_fields" : self . best_recall_fields ,
-312 "all" : self . mappings ,
-313 "best_map_fields" : self . best_map_fields ,
-314 "best_embed_fields" : self . best_embed_fields ,
-315 "sensible" : self . sensible ,
-316 "sensible_embed" : self . sensible_embed ,
-317 "sensible_embed_safe" : self . sensible_embed_safe ,
-318 }
-
-
-
-
-
-
-
-
-
-
- def
- generate_query (self , * args , ** kwargs ):
-
- View Source
-
-
-
-
231 def use_config ( self , * args , ** kwargs ):
-232 """
-233 Replaces keywords and args passed to the function with ones from self.config.
-234
-235 :param self:
-236 :param args: To be updated
-237 :param kwargs: To be updated
-238 :return:
-239 """
-240 if self . config is not None :
-241 kwargs = self . config . __update__ ( ** kwargs )
-242
-243 return func ( self , * args , ** kwargs )
-
-
-
-
Generates a query for the clinical trials index
-
-
Parameters
-
-
-topic_num : Topic number to search
-query_field_usage : Which document facets to search over
-kwargs :
-
-
-
Returns
-
-
-A basic elasticsearch query for clinical trials
-
-
-
-
-
-
-
-
-
-
- def
- generate_query_ablation (self , topic_num , ** kwargs ):
-
- View Source
-
-
-
-
356 def generate_query_ablation ( self , topic_num , ** kwargs ):
-357 """
-358 Only search one document facet at a time
-359 :param topic_num:
-360 :param kwargs:
-361 :return:
-362 """
-363 query = { "query" : { "match" : {}}}
-364
-365 for field in self . fields :
-366 query [ "query" ][ "match" ][ self . mappings [ field ]] = ""
-367
-368 for qfield in self . fields :
-369 qfield = self . mappings [ qfield ]
-370 for field in self . topics [ topic_num ]:
-371 query [ "query" ][ "match" ][ qfield ] += self . topics [ topic_num ][ field ]
-372
-373 return query
-
-
-
-
Only search one document facet at a time
-
-
Parameters
-
-
-
-
Returns
-
-
-
-
-
-
-
-
- def
- generate_query_embedding (self , * args , ** kwargs ):
-
- View Source
-
-
-
-
231 def use_config ( self , * args , ** kwargs ):
-232 """
-233 Replaces keywords and args passed to the function with ones from self.config.
-234
-235 :param self:
-236 :param args: To be updated
-237 :param kwargs: To be updated
-238 :return:
-239 """
-240 if self . config is not None :
-241 kwargs = self . config . __update__ ( ** kwargs )
-242
-243 return func ( self , * args , ** kwargs )
-
-
-
-
Computes the NIR score for a given topic
-
-
Score = log(BM25)/log(norm_weight) + embedding_score
-
-
Parameters
-
-
-topic_num :
-encoder :
-query_field_usage :
-embed_field_usage :
-cosine_weights :
-query_weight :
-norm_weight :
-ablations :
-automatic_scores :
-kwargs :
-
-
-
Returns
-
-
-
-
-
-
-
-
- def
- get_query_type (self , * args , ** kwargs ):
-
- View Source
-
-
-
-
455 def get_query_type ( self , * args , ** kwargs ):
-456 return self . query_funcs [ self . query_type ]( * args , ** kwargs )
-
-
-
-
-
-
-
-
-
-
- def
- get_id_mapping (self , hit ):
-
- View Source
-
-
-
-
458 def get_id_mapping ( self , hit ):
-459 return hit [ self . id_mapping ]
-
-
-
-
Get the document ID
-
-
Parameters
-
-
-hit : The raw document result
-
-
-
Returns
-
-
-The document's ID
-
-
-
-
-
-
-
-
Inherited Members
-
-
-
-
-
-
-
-
-
- 462 class ClinicalTrialsElasticsearchExecutor ( GenericElasticsearchExecutor ):
-463 """
-464 Executes queries given a query object.
-465 """
-466 query : TrialsElasticsearchQuery
-467
-468 def __init__ (
-469 self ,
-470 topics : Dict [ Union [ str , int ], Dict [ str , str ]],
-471 client : Elasticsearch ,
-472 index_name : str ,
-473 output_file : str ,
-474 query : TrialsElasticsearchQuery ,
-475 encoder : Optional [ Encoder ] = None ,
-476 config = None ,
-477 * args ,
-478 ** kwargs ,
-479 ):
-480
-481 super () . __init__ (
-482 topics ,
-483 client ,
-484 index_name ,
-485 output_file ,
-486 query ,
-487 encoder ,
-488 config = config ,
-489 * args ,
-490 ** kwargs ,
-491 )
-492
-493 self . query_fns = {
-494 "query" : self . generate_query ,
-495 "ablation" : self . generate_query_ablation ,
-496 "embedding" : self . generate_embedding_query ,
-497 }
-
-
-
- Executes queries given a query object.
-
-
-
-
-
-
-
-
ClinicalTrialsElasticsearchExecutor ( topics : Dict [ Union [ str , int ], Dict [ str , str ]] , client : elasticsearch . AsyncElasticsearch , index_name : str , output_file : str , query : debeir.data_sets.clinical_trials.TrialsElasticsearchQuery , encoder : Optional [ debeir.rankers.transformer_sent_encoder.Encoder ] = None , config = None , * args , ** kwargs )
-
-
View Source
-
-
-
-
468 def __init__ (
-469 self ,
-470 topics : Dict [ Union [ str , int ], Dict [ str , str ]],
-471 client : Elasticsearch ,
-472 index_name : str ,
-473 output_file : str ,
-474 query : TrialsElasticsearchQuery ,
-475 encoder : Optional [ Encoder ] = None ,
-476 config = None ,
-477 * args ,
-478 ** kwargs ,
-479 ):
-480
-481 super () . __init__ (
-482 topics ,
-483 client ,
-484 index_name ,
-485 output_file ,
-486 query ,
-487 encoder ,
-488 config = config ,
-489 * args ,
-490 ** kwargs ,
-491 )
-492
-493 self . query_fns = {
-494 "query" : self . generate_query ,
-495 "ablation" : self . generate_query_ablation ,
-496 "embedding" : self . generate_embedding_query ,
-497 }
-
-
-
-
-
-
-
-
Inherited Members
-
-
-
-
-
-
-
-
-
-
- 500 class ClinicalTrialParser ( Parser ):
-501 """
-502 Parser for Clinical Trials topics
-503 """
-504 @classmethod
-505 def get_topics ( cls , csvfile ) -> Dict [ int , Dict [ str , str ]]:
-506 topics = {}
-507 reader = csv . reader ( csvfile )
-508 for i , row in enumerate ( reader ):
-509 if i == 0 :
-510 continue
-511
-512 _id = row [ 0 ]
-513 text = row [ 1 ]
-514
-515 topics [ _id ] = { "text" : text }
-516
-517 return topics
-
-
-
- Parser for Clinical Trials topics
-
-
-
-
-
-
- ClinicalTrialParser ()
-
-
-
-
-
-
-
-
-
-
-
-
@classmethod
-
-
def
-
get_topics (cls , csvfile ) -> Dict [ int , Dict [ str , str ]] :
-
-
View Source
-
-
-
-
504 @classmethod
-505 def get_topics ( cls , csvfile ) -> Dict [ int , Dict [ str , str ]]:
-506 topics = {}
-507 reader = csv . reader ( csvfile )
-508 for i , row in enumerate ( reader ):
-509 if i == 0 :
-510 continue
-511
-512 _id = row [ 0 ]
-513 text = row [ 1 ]
-514
-515 topics [ _id ] = { "text" : text }
-516
-517 return topics
-
-
-
-
Instance method for getting topics, forwards instance self parameters to the _get_topics class method.
-
-
-
-
-
-
Inherited Members
-
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/docs/debeir/data_sets/factory.html b/docs/debeir/data_sets/factory.html
deleted file mode 100644
index 665b6ea..0000000
--- a/docs/debeir/data_sets/factory.html
+++ /dev/null
@@ -1,683 +0,0 @@
-
-
-
-
-
-
- debeir.data_sets.factory API documentation
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- View Source
-
- 1 from pathlib import Path
- 2 from typing import Dict , Type , Union
- 3
- 4 import toml
- 5
- 6 from debeir.evaluation.evaluator import Evaluator
- 7 from debeir.evaluation.residual_scoring import ResidualEvaluator
- 8 from debeir.data_sets.trec_clinical_trials import TrecClincialElasticsearchQuery , TrecClinicalTrialsParser
- 9 from debeir.interfaces.config import GenericConfig , _NIRMasterConfig , SolrConfig , ElasticsearchConfig , MetricsConfig , \
- 10 NIRConfig , Config
- 11 from debeir.interfaces.query import GenericElasticsearchQuery , Query
- 12 from debeir.data_sets.clinical_trials import TrialsElasticsearchQuery
- 13 from debeir.data_sets.trec_covid import TrecElasticsearchQuery
- 14
- 15 from debeir.data_sets.clinical_trials import (
- 16 ClinicalTrialsElasticsearchExecutor ,
- 17 ClinicalTrialParser ,
- 18 TrialsQueryConfig ,
- 19 )
- 20 from debeir.data_sets.marco import MarcoElasticsearchExecutor , MarcoQueryConfig
- 21 from debeir.interfaces.executor import GenericElasticsearchExecutor
- 22 from debeir.interfaces.parser import (
- 23 CSVParser , Parser , TSVParser ,
- 24 )
- 25 from debeir.data_sets.bioreddit import BioRedditSubmissionParser , BioRedditCommentParser
- 26 from debeir.data_sets.trec_covid import TrecCovidParser
- 27
- 28 str_to_config_cls = {
- 29 "clinical_trials" : TrialsQueryConfig ,
- 30 "test_trials" : TrialsQueryConfig ,
- 31 "med-marco" : MarcoQueryConfig ,
- 32 "generic" : MarcoQueryConfig ,
- 33 }
- 34
- 35 query_factory = {
- 36 "clinical_trials" : TrialsElasticsearchQuery ,
- 37 "test_trials" : TrialsElasticsearchQuery ,
- 38 "generic" : GenericElasticsearchQuery ,
- 39 "trec_covid" : TrecElasticsearchQuery ,
- 40 "trec_clinical" : TrecClincialElasticsearchQuery ,
- 41 }
- 42
- 43 parser_factory = {
- 44 "trec_covid" : TrecCovidParser ,
- 45 "bioreddit-comment" : BioRedditCommentParser ,
- 46 "bioreddit-submission" : BioRedditSubmissionParser ,
- 47 "test_trials" : ClinicalTrialParser ,
- 48 "med-marco" : CSVParser ,
- 49 "tsv" : TSVParser ,
- 50 "trec_clinical" : TrecClinicalTrialsParser
- 51 }
- 52
- 53 executor_factory = {
- 54 "clinical" : ClinicalTrialsElasticsearchExecutor ,
- 55 "med-marco" : MarcoElasticsearchExecutor ,
- 56 "generic" : GenericElasticsearchExecutor ,
- 57 }
- 58
- 59 evaluator_factory = {
- 60 "residual" : ResidualEvaluator ,
- 61 "trec" : Evaluator ,
- 62 }
- 63
- 64
- 65 def get_index_name ( config_fp ):
- 66 """
- 67 Get the index name from the config without parsing as a TOML
- 68
- 69 :param config_fp:
- 70 :return:
- 71 """
- 72 with open ( config_fp , "r" ) as reader :
- 73 for line in reader :
- 74 if line . startswith ( "index" ):
- 75 line = line . replace ( '"' , "" )
- 76 return line . split ( "=" )[ - 1 ] . strip ()
- 77 return None
- 78
- 79
- 80 def factory_fn ( config_fp , index = None ) -> ( Query , GenericConfig ,
- 81 Parser , GenericElasticsearchExecutor , Evaluator ):
- 82 """
- 83 Factory method for creating the parsed topics, config object, query object and query executor object
- 84
- 85 :param config_fp: Config file path
- 86 :param index: Index to search
- 87 :return:
- 88 Query, Config, Parser, Executor, Evaluator
- 89 """
- 90 config = config_factory ( config_fp )
- 91 assert config . index is not None
- 92 query_cls = query_factory [ config . query_fn ]
- 93 parser = parser_factory [ config . parser_fn ]
- 94 executor = executor_factory [ config . executor_fn ]
- 95
- 96 return query_cls , config , parser , executor
- 97
- 98
- 99 def config_factory ( path : Union [ str , Path ] = None , config_cls : Type [ Config ] = None , args_dict : Dict = None ):
-100 """
-101 Factory method for creating configs
-102
-103 :param path: Config path
-104 :param config_cls: Config class to instantiate
-105 :param args_dict: Arguments to consider
-106 :return:
-107 A config object
-108 """
-109 if path :
-110 args_dict = toml . load ( path )
-111
-112 if not config_cls :
-113 if "config_fn" in args_dict :
-114 config_cls = str_to_config_cls [ args_dict [ "config_fn" ]]
-115 else :
-116 raise NotImplementedError ()
-117
-118 return config_cls . from_args ( args_dict , config_cls )
-119
-120
-121 def get_nir_config ( nir_config , * args , ignore_errors = False , ** kwargs ):
-122 main_config = config_factory ( nir_config , config_cls = _NIRMasterConfig )
-123 search_engine_config = None
-124
-125 supported_search_engines = { "solr" : SolrConfig ,
-126 "elasticsearch" : ElasticsearchConfig }
-127
-128 search_engine_config = None
-129
-130 if 'engine' in kwargs and kwargs [ 'engine' ] in supported_search_engines :
-131 search_engine = kwargs [ 'engine' ]
-132 search_engine_config = config_factory ( args_dict = main_config . get_search_engine_settings ( search_engine ),
-133 config_cls = supported_search_engines [ search_engine ])
-134
-135 #for search_engine in supported_search_engines:
-136 # if search_engine in kwargs and kwargs[search_engine] and kwargs['engine'] == search_engine:
-137 # search_engine_config = config_factory(args_dict=main_config.get_search_engine_settings(search_engine),
-138 # config_cls=supported_search_engines[search_engine])
-139
-140 if not ignore_errors and search_engine_config is None :
-141 raise RuntimeError ( "Unable to get a search engine configuration." )
-142
-143 metrics_config = config_factory ( args_dict = main_config . get_metrics (), config_cls = MetricsConfig )
-144 nir_config = config_factory ( args_dict = main_config . get_nir_settings (), config_cls = NIRConfig )
-145
-146 return nir_config , search_engine_config , metrics_config
-147
-148
-149 def apply_nir_config ( func ):
-150 """
-151 Decorator that applies the NIR config settings to the current function
-152 Replaces arguments and keywords arguments with those found in the config
-153
-154 :param func:
-155 :return:
-156 """
-157
-158 def parse_nir_config ( * args , ignore_errors = False , ** kwargs ):
-159 """
-160 Parses the NIR config for the different setting groups: Search Engine, Metrics and NIR settings
-161 Applies these settings to the current function
-162 :param ignore_errors:
-163 :param args:
-164 :param kwargs:
-165 :return:
-166 """
-167
-168 nir_config , search_engine_config , metrics_config = get_nir_config ( * args ,
-169 ignore_errors ,
-170 ** kwargs )
-171
-172 kwargs = nir_config . __update__ (
-173 ** search_engine_config . __update__ (
-174 ** metrics_config . __update__ ( ** kwargs )
-175 )
-176 )
-177
-178 return func ( * args , ** kwargs )
-179
-180 return parse_nir_config
-
-
-
-
-
-
-
-
- def
- get_index_name (config_fp ):
-
- View Source
-
-
-
- 66 def get_index_name ( config_fp ):
-67 """
-68 Get the index name from the config without parsing as a TOML
-69
-70 :param config_fp:
-71 :return:
-72 """
-73 with open ( config_fp , "r" ) as reader :
-74 for line in reader :
-75 if line . startswith ( "index" ):
-76 line = line . replace ( '"' , "" )
-77 return line . split ( "=" )[ - 1 ] . strip ()
-78 return None
-
-
-
- Get the index name from the config without parsing as a TOML
-
-
Parameters
-
-
-
-
Returns
-
-
-
-
-
-
-
-
- 81 def factory_fn ( config_fp , index = None ) -> ( Query , GenericConfig ,
-82 Parser , GenericElasticsearchExecutor , Evaluator ):
-83 """
-84 Factory method for creating the parsed topics, config object, query object and query executor object
-85
-86 :param config_fp: Config file path
-87 :param index: Index to search
-88 :return:
-89 Query, Config, Parser, Executor, Evaluator
-90 """
-91 config = config_factory ( config_fp )
-92 assert config . index is not None
-93 query_cls = query_factory [ config . query_fn ]
-94 parser = parser_factory [ config . parser_fn ]
-95 executor = executor_factory [ config . executor_fn ]
-96
-97 return query_cls , config , parser , executor
-
-
-
- Factory method for creating the parsed topics, config object, query object and query executor object
-
-
Parameters
-
-
-config_fp : Config file path
-index : Index to search
-
-
-
Returns
-
-
-Query, Config, Parser, Executor, Evaluator
-
-
-
-
-
-
-
-
-
-
-
def
-
config_factory ( path : Union [ str , pathlib . Path ] = None , config_cls : Type [ debeir.interfaces.config.Config ] = None , args_dict : Dict = None ):
-
-
View Source
-
-
-
- 100 def config_factory ( path : Union [ str , Path ] = None , config_cls : Type [ Config ] = None , args_dict : Dict = None ):
-101 """
-102 Factory method for creating configs
-103
-104 :param path: Config path
-105 :param config_cls: Config class to instantiate
-106 :param args_dict: Arguments to consider
-107 :return:
-108 A config object
-109 """
-110 if path :
-111 args_dict = toml . load ( path )
-112
-113 if not config_cls :
-114 if "config_fn" in args_dict :
-115 config_cls = str_to_config_cls [ args_dict [ "config_fn" ]]
-116 else :
-117 raise NotImplementedError ()
-118
-119 return config_cls . from_args ( args_dict , config_cls )
-
-
-
- Factory method for creating configs
-
-
Parameters
-
-
-path : Config path
-config_cls : Config class to instantiate
-args_dict : Arguments to consider
-
-
-
Returns
-
-
-A config object
-
-
-
-
-
-
-
-
-
-
- def
- get_nir_config (nir_config , * args , ignore_errors = False , ** kwargs ):
-
- View Source
-
-
-
- 122 def get_nir_config ( nir_config , * args , ignore_errors = False , ** kwargs ):
-123 main_config = config_factory ( nir_config , config_cls = _NIRMasterConfig )
-124 search_engine_config = None
-125
-126 supported_search_engines = { "solr" : SolrConfig ,
-127 "elasticsearch" : ElasticsearchConfig }
-128
-129 search_engine_config = None
-130
-131 if 'engine' in kwargs and kwargs [ 'engine' ] in supported_search_engines :
-132 search_engine = kwargs [ 'engine' ]
-133 search_engine_config = config_factory ( args_dict = main_config . get_search_engine_settings ( search_engine ),
-134 config_cls = supported_search_engines [ search_engine ])
-135
-136 #for search_engine in supported_search_engines:
-137 # if search_engine in kwargs and kwargs[search_engine] and kwargs['engine'] == search_engine:
-138 # search_engine_config = config_factory(args_dict=main_config.get_search_engine_settings(search_engine),
-139 # config_cls=supported_search_engines[search_engine])
-140
-141 if not ignore_errors and search_engine_config is None :
-142 raise RuntimeError ( "Unable to get a search engine configuration." )
-143
-144 metrics_config = config_factory ( args_dict = main_config . get_metrics (), config_cls = MetricsConfig )
-145 nir_config = config_factory ( args_dict = main_config . get_nir_settings (), config_cls = NIRConfig )
-146
-147 return nir_config , search_engine_config , metrics_config
-
-
-
-
-
-
-
-
-
-
- def
- apply_nir_config (func ):
-
- View Source
-
-
-
- 150 def apply_nir_config ( func ):
-151 """
-152 Decorator that applies the NIR config settings to the current function
-153 Replaces arguments and keywords arguments with those found in the config
-154
-155 :param func:
-156 :return:
-157 """
-158
-159 def parse_nir_config ( * args , ignore_errors = False , ** kwargs ):
-160 """
-161 Parses the NIR config for the different setting groups: Search Engine, Metrics and NIR settings
-162 Applies these settings to the current function
-163 :param ignore_errors:
-164 :param args:
-165 :param kwargs:
-166 :return:
-167 """
-168
-169 nir_config , search_engine_config , metrics_config = get_nir_config ( * args ,
-170 ignore_errors ,
-171 ** kwargs )
-172
-173 kwargs = nir_config . __update__ (
-174 ** search_engine_config . __update__ (
-175 ** metrics_config . __update__ ( ** kwargs )
-176 )
-177 )
-178
-179 return func ( * args , ** kwargs )
-180
-181 return parse_nir_config
-
-
-
- Decorator that applies the NIR config settings to the current function
-Replaces arguments and keywords arguments with those found in the config
-
-
Parameters
-
-
-
-
Returns
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/docs/debeir/data_sets/marco.html b/docs/debeir/data_sets/marco.html
deleted file mode 100644
index 3b0197f..0000000
--- a/docs/debeir/data_sets/marco.html
+++ /dev/null
@@ -1,771 +0,0 @@
-
-
-
-
-
-
- debeir.data_sets.marco API documentation
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- View Source
-
- 1 from dataclasses import dataclass
- 2 from typing import Dict , Union , Optional
- 3
- 4 from elasticsearch import AsyncElasticsearch as Elasticsearch
- 5
- 6 from debeir.interfaces.config import GenericConfig
- 7 from debeir.interfaces.executor import GenericElasticsearchExecutor
- 8 from debeir.interfaces.query import GenericElasticsearchQuery
- 9 from debeir.rankers.transformer_sent_encoder import Encoder
-10
-11
-12 class MarcoElasticsearchExecutor ( GenericElasticsearchExecutor ):
-13 query : GenericElasticsearchQuery
-14
-15 def __init__ (
-16 self ,
-17 topics : Dict [ Union [ str , int ], Dict [ str , str ]],
-18 client : Elasticsearch ,
-19 index_name : str ,
-20 output_file : str ,
-21 query : GenericElasticsearchQuery ,
-22 encoder : Optional [ Encoder ] = None ,
-23 config = None ,
-24 * args ,
-25 ** kwargs ,
-26 ):
-27 super () . __init__ (
-28 topics ,
-29 client ,
-30 index_name ,
-31 output_file ,
-32 query ,
-33 encoder ,
-34 config = config ,
-35 * args ,
-36 ** kwargs ,
-37 )
-38
-39 self . query_fns = {
-40 "query" : self . generate_query ,
-41 "embedding" : self . generate_embedding_query ,
-42 }
-43
-44 def generate_query ( self , topic_num , best_fields = True , ** kwargs ):
-45 return self . query . generate_query ( topic_num )
-46
-47 def generate_embedding_query (
-48 self ,
-49 topic_num ,
-50 cosine_weights = None ,
-51 query_weights = None ,
-52 norm_weight = 2.15 ,
-53 automatic_scores = None ,
-54 ** kwargs ,
-55 ):
-56 return super () . generate_embedding_query (
-57 topic_num ,
-58 cosine_weights = cosine_weights ,
-59 query_weights = query_weights ,
-60 norm_weight = 2.15 ,
-61 automatic_scores = None ,
-62 ** kwargs ,
-63 )
-64
-65 async def execute_query (
-66 self , query = None , topic_num = None , ablation = False , query_type = "query" , ** kwargs
-67 ):
-68 return super () . execute_query (
-69 query , topic_num , ablation , query_type = query_type , ** kwargs
-70 )
-71
-72
-73 @dataclass ( init = True , unsafe_hash = True )
-74 class MarcoQueryConfig ( GenericConfig ):
-75 def validate ( self ):
-76 if self . query_type == "embedding" :
-77 assert (
-78 self . encoder_fp and self . encoder
-79 ), "Must provide encoder path for embedding model"
-80 assert self . norm_weight is not None or self . automatic is not None , (
-81 "Norm weight be " "specified or be automatic"
-82 )
-83
-84 @classmethod
-85 def from_toml ( cls , fp : str , * args , ** kwargs ) -> "MarcoQueryConfig" :
-86 return super () . from_toml ( fp , cls , * args , ** kwargs )
-87
-88 @classmethod
-89 def from_dict ( cls , ** kwargs ) -> "MarcoQueryConfig" :
-90 return super () . from_dict ( cls , ** kwargs )
-
-
-
-
-
-
-
-
- 13 class MarcoElasticsearchExecutor ( GenericElasticsearchExecutor ):
-14 query : GenericElasticsearchQuery
-15
-16 def __init__ (
-17 self ,
-18 topics : Dict [ Union [ str , int ], Dict [ str , str ]],
-19 client : Elasticsearch ,
-20 index_name : str ,
-21 output_file : str ,
-22 query : GenericElasticsearchQuery ,
-23 encoder : Optional [ Encoder ] = None ,
-24 config = None ,
-25 * args ,
-26 ** kwargs ,
-27 ):
-28 super () . __init__ (
-29 topics ,
-30 client ,
-31 index_name ,
-32 output_file ,
-33 query ,
-34 encoder ,
-35 config = config ,
-36 * args ,
-37 ** kwargs ,
-38 )
-39
-40 self . query_fns = {
-41 "query" : self . generate_query ,
-42 "embedding" : self . generate_embedding_query ,
-43 }
-44
-45 def generate_query ( self , topic_num , best_fields = True , ** kwargs ):
-46 return self . query . generate_query ( topic_num )
-47
-48 def generate_embedding_query (
-49 self ,
-50 topic_num ,
-51 cosine_weights = None ,
-52 query_weights = None ,
-53 norm_weight = 2.15 ,
-54 automatic_scores = None ,
-55 ** kwargs ,
-56 ):
-57 return super () . generate_embedding_query (
-58 topic_num ,
-59 cosine_weights = cosine_weights ,
-60 query_weights = query_weights ,
-61 norm_weight = 2.15 ,
-62 automatic_scores = None ,
-63 ** kwargs ,
-64 )
-65
-66 async def execute_query (
-67 self , query = None , topic_num = None , ablation = False , query_type = "query" , ** kwargs
-68 ):
-69 return super () . execute_query (
-70 query , topic_num , ablation , query_type = query_type , ** kwargs
-71 )
-
-
-
- Generic Executor class for Elasticsearch
-
-
-
-
-
-
-
-
MarcoElasticsearchExecutor ( topics : Dict [ Union [ str , int ], Dict [ str , str ]] , client : elasticsearch . AsyncElasticsearch , index_name : str , output_file : str , query : debeir.interfaces.query.GenericElasticsearchQuery , encoder : Optional [ debeir.rankers.transformer_sent_encoder.Encoder ] = None , config = None , * args , ** kwargs )
-
-
View Source
-
-
-
-
16 def __init__ (
-17 self ,
-18 topics : Dict [ Union [ str , int ], Dict [ str , str ]],
-19 client : Elasticsearch ,
-20 index_name : str ,
-21 output_file : str ,
-22 query : GenericElasticsearchQuery ,
-23 encoder : Optional [ Encoder ] = None ,
-24 config = None ,
-25 * args ,
-26 ** kwargs ,
-27 ):
-28 super () . __init__ (
-29 topics ,
-30 client ,
-31 index_name ,
-32 output_file ,
-33 query ,
-34 encoder ,
-35 config = config ,
-36 * args ,
-37 ** kwargs ,
-38 )
-39
-40 self . query_fns = {
-41 "query" : self . generate_query ,
-42 "embedding" : self . generate_embedding_query ,
-43 }
-
-
-
-
-
-
-
-
-
-
- def
- generate_query (self , topic_num , best_fields = True , ** kwargs ):
-
- View Source
-
-
-
-
45 def generate_query ( self , topic_num , best_fields = True , ** kwargs ):
-46 return self . query . generate_query ( topic_num )
-
-
-
-
Generates a standard BM25 query given the topic number
-
-
Parameters
-
-
-topic_num : Query topic number to generate
-best_fields : Whether to use a curated list of fields
-kwargs :
-
-
-
Returns
-
-
-
-
-
-
-
-
- def
- generate_embedding_query ( self , topic_num , cosine_weights = None , query_weights = None , norm_weight = 2.15 , automatic_scores = None , ** kwargs ):
-
- View Source
-
-
-
-
48 def generate_embedding_query (
-49 self ,
-50 topic_num ,
-51 cosine_weights = None ,
-52 query_weights = None ,
-53 norm_weight = 2.15 ,
-54 automatic_scores = None ,
-55 ** kwargs ,
-56 ):
-57 return super () . generate_embedding_query (
-58 topic_num ,
-59 cosine_weights = cosine_weights ,
-60 query_weights = query_weights ,
-61 norm_weight = 2.15 ,
-62 automatic_scores = None ,
-63 ** kwargs ,
-64 )
-
-
-
-
Executes an NIR-style query with combined scoring.
-
-
Parameters
-
-
-topic_num :
-cosine_weights :
-query_weights :
-norm_weight :
-automatic_scores :
-kwargs :
-
-
-
Returns
-
-
-
-
-
-
-
-
- async def
- execute_query ( self , query = None , topic_num = None , ablation = False , query_type = 'query' , ** kwargs ):
-
- View Source
-
-
-
-
66 async def execute_query (
-67 self , query = None , topic_num = None , ablation = False , query_type = "query" , ** kwargs
-68 ):
-69 return super () . execute_query (
-70 query , topic_num , ablation , query_type = query_type , ** kwargs
-71 )
-
-
-
-
Execute a query given parameters
-
-
Parameters
-
-
-
-
-
-
-
-
Inherited Members
-
-
-
-
-
-
-
-
-
-
- 74 @dataclass ( init = True , unsafe_hash = True )
-75 class MarcoQueryConfig ( GenericConfig ):
-76 def validate ( self ):
-77 if self . query_type == "embedding" :
-78 assert (
-79 self . encoder_fp and self . encoder
-80 ), "Must provide encoder path for embedding model"
-81 assert self . norm_weight is not None or self . automatic is not None , (
-82 "Norm weight be " "specified or be automatic"
-83 )
-84
-85 @classmethod
-86 def from_toml ( cls , fp : str , * args , ** kwargs ) -> "MarcoQueryConfig" :
-87 return super () . from_toml ( fp , cls , * args , ** kwargs )
-88
-89 @classmethod
-90 def from_dict ( cls , ** kwargs ) -> "MarcoQueryConfig" :
-91 return super () . from_dict ( cls , ** kwargs )
-
-
-
-
-
-
-
-
- MarcoQueryConfig ( query_type : str , index : str = None , encoder_normalize : bool = True , ablations : bool = False , norm_weight : float = None , automatic : bool = None , encoder : object = None , encoder_fp : str = None , query_weights : List [ float ] = None , cosine_weights : List [ float ] = None , evaluate : bool = False , qrels : str = None , config_fn : str = None , query_fn : str = None , parser_fn : str = None , executor_fn : str = None , cosine_ceiling : float = None , topics_path : str = None , return_id_only : bool = False , overwrite_output_if_exists : bool = False , output_file : str = None , run_name : str = None )
-
-
-
-
-
-
-
-
-
-
-
-
- def
- validate (self ):
-
- View Source
-
-
-
-
76 def validate ( self ):
-77 if self . query_type == "embedding" :
-78 assert (
-79 self . encoder_fp and self . encoder
-80 ), "Must provide encoder path for embedding model"
-81 assert self . norm_weight is not None or self . automatic is not None , (
-82 "Norm weight be " "specified or be automatic"
-83 )
-
-
-
-
Validates if the config is correct.
-Must be implemented by inherited classes.
-
-
-
-
-
-
-
-
-
85 @classmethod
-86 def from_toml ( cls , fp : str , * args , ** kwargs ) -> "MarcoQueryConfig" :
-87 return super () . from_toml ( fp , cls , * args , ** kwargs )
-
-
-
-
Instantiates a Config object from a toml file
-
-
Parameters
-
-
-fp : File path of the Config TOML file
-field_class : Class of the Config object to be instantiated
-args : Arguments to be passed to Config
-kwargs : Keyword arguments to be passed
-
-
-
Returns
-
-
-A instantiated and validated Config object.
-
-
-
-
-
-
-
-
-
-
-
89 @classmethod
-90 def from_dict ( cls , ** kwargs ) -> "MarcoQueryConfig" :
-91 return super () . from_dict ( cls , ** kwargs )
-
-
-
-
Instantiates a Config object from a dictionary
-
-
Parameters
-
-
-
-
Returns
-
-
-
-
-
-
Inherited Members
-
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/docs/debeir/data_sets/trec_clinical_trials.html b/docs/debeir/data_sets/trec_clinical_trials.html
deleted file mode 100644
index 94b5864..0000000
--- a/docs/debeir/data_sets/trec_clinical_trials.html
+++ /dev/null
@@ -1,611 +0,0 @@
-
-
-
-
-
-
- debeir.data_sets.trec_clinical_trials API documentation
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- View Source
-
- 1 import pathlib
- 2 import xml.etree.ElementTree as ET
- 3 from collections import defaultdict
- 4 from typing import Dict , List
- 5
- 6 import pandas as pd
- 7
- 8 from debeir.interfaces.query import GenericElasticsearchQuery
- 9 from debeir.interfaces.parser import XMLParser , JsonLinesParser
- 10
- 11
- 12 class TREClinicalTrialDocumentParser ( XMLParser ):
- 13 """
- 14 Parser for Clinical Trials topics
- 15 """
- 16
- 17 parse_fields : List [ str ] = [ "brief_title" , "official_title" ,
- 18 "brief_summary" , "detailed_description" ,
- 19 "eligibility" , "condition_browse" ,
- 20 "intervention_browse" ]
- 21 topic_field_name : str
- 22 id_field : str
- 23
- 24 @classmethod
- 25 def extract ( cls , path ) -> Dict :
- 26 document = ET . parse ( path ) . getroot ()
- 27 document_dict = defaultdict ( lambda : defaultdict ( lambda : []))
- 28 document_dict [ 'doc_id' ] = pathlib . Path ( path ) . parts [ - 1 ] . strip ( ".xml" )
- 29
- 30 for parse_field in cls . parse_fields :
- 31 node = document . find ( parse_field )
- 32 nodes : List [ ET . Element ] = []
- 33
- 34 if node is not None :
- 35 cls . _recurse_to_child_node ( node , nodes )
- 36
- 37 if len ( nodes ) == 0 and node is not None :
- 38 document_dict [ parse_field ] = node . text
- 39
- 40 for node in nodes :
- 41 text = node . text . strip ()
- 42
- 43 if not text :
- 44 continue
- 45
- 46 if document_dict [ parse_field ][ node . tag ]:
- 47 document_dict [ parse_field ][ node . tag ] . append ( text )
- 48 else :
- 49 document_dict [ parse_field ][ node . tag ] = [ text ]
- 50
- 51 cls . unwrap ( document_dict , parse_field )
- 52
- 53 document_dict = pd . io . json . json_normalize ( document_dict ,
- 54 sep = "." ) . to_dict ( orient = 'records' )[ 0 ]
- 55
- 56 return document_dict
- 57
- 58
- 59 TrecClinicalTrialTripletParser = JsonLinesParser (
- 60 parse_fields = [ "q_text" , "brief_title" , "official_title" ,
- 61 "brief_summary" , "detailed_description" , "rel" ],
- 62 id_field = "qid" ,
- 63 secondary_id = "doc_id" ,
- 64 ignore_full_match = True
- 65 )
- 66
- 67 TrecClinicalTrialsParser = XMLParser (
- 68 parse_fields = None ,
- 69 id_field = "number" ,
- 70 topic_field_name = "topic" )
- 71
- 72
- 73 class TrecClincialElasticsearchQuery ( GenericElasticsearchQuery ):
- 74 def __init__ ( self , topics , config , * args , ** kwargs ):
- 75 super () . __init__ ( topics , config , * args , ** kwargs )
- 76
- 77 #self.mappings = ['BriefTitle_Text',
- 78 # 'BriefSummary_Text',
- 79 # 'DetailedDescription_Text']
- 80
- 81 self . mappings = [
- 82 "BriefSummary_Text" ,
- 83 "BriefTitle_Text" ,
- 84 'DetailedDescription_Text' ,
- 85 'Eligibility.Criteria.Textblock'
- 86 'Eligibility.StudyPop.Textblock' ,
- 87 'ConditionBrowse.MeshTerm' ,
- 88 'InterventionBrowse.MeshTerm' ,
- 89 'Condition' ,
- 90 'Eligibility.Gender' ,
- 91 "OfficialTitle" ]
- 92
- 93 self . topics = topics
- 94 self . config = config
- 95 self . query_type = self . config . query_type
- 96
- 97 self . embed_mappings = [ 'BriefTitle_Embedding' ,
- 98 'BriefSummary_Embedding' ,
- 99 'DetailedDescription_Embedding' ]
-100
-101 self . id_mapping = "docid"
-102
-103 self . query_funcs = {
-104 "query" : self . generate_query ,
-105 "embedding" : self . generate_query_embedding ,
-106 }
-
-
-
-
-
-
-
-
- 13 class TREClinicalTrialDocumentParser ( XMLParser ):
-14 """
-15 Parser for Clinical Trials topics
-16 """
-17
-18 parse_fields : List [ str ] = [ "brief_title" , "official_title" ,
-19 "brief_summary" , "detailed_description" ,
-20 "eligibility" , "condition_browse" ,
-21 "intervention_browse" ]
-22 topic_field_name : str
-23 id_field : str
-24
-25 @classmethod
-26 def extract ( cls , path ) -> Dict :
-27 document = ET . parse ( path ) . getroot ()
-28 document_dict = defaultdict ( lambda : defaultdict ( lambda : []))
-29 document_dict [ 'doc_id' ] = pathlib . Path ( path ) . parts [ - 1 ] . strip ( ".xml" )
-30
-31 for parse_field in cls . parse_fields :
-32 node = document . find ( parse_field )
-33 nodes : List [ ET . Element ] = []
-34
-35 if node is not None :
-36 cls . _recurse_to_child_node ( node , nodes )
-37
-38 if len ( nodes ) == 0 and node is not None :
-39 document_dict [ parse_field ] = node . text
-40
-41 for node in nodes :
-42 text = node . text . strip ()
-43
-44 if not text :
-45 continue
-46
-47 if document_dict [ parse_field ][ node . tag ]:
-48 document_dict [ parse_field ][ node . tag ] . append ( text )
-49 else :
-50 document_dict [ parse_field ][ node . tag ] = [ text ]
-51
-52 cls . unwrap ( document_dict , parse_field )
-53
-54 document_dict = pd . io . json . json_normalize ( document_dict ,
-55 sep = "." ) . to_dict ( orient = 'records' )[ 0 ]
-56
-57 return document_dict
-
-
-
- Parser for Clinical Trials topics
-
-
-
-
-
-
Inherited Members
-
-
-
-
-
-
-
-
-
-
- 74 class TrecClincialElasticsearchQuery ( GenericElasticsearchQuery ):
- 75 def __init__ ( self , topics , config , * args , ** kwargs ):
- 76 super () . __init__ ( topics , config , * args , ** kwargs )
- 77
- 78 #self.mappings = ['BriefTitle_Text',
- 79 # 'BriefSummary_Text',
- 80 # 'DetailedDescription_Text']
- 81
- 82 self . mappings = [
- 83 "BriefSummary_Text" ,
- 84 "BriefTitle_Text" ,
- 85 'DetailedDescription_Text' ,
- 86 'Eligibility.Criteria.Textblock'
- 87 'Eligibility.StudyPop.Textblock' ,
- 88 'ConditionBrowse.MeshTerm' ,
- 89 'InterventionBrowse.MeshTerm' ,
- 90 'Condition' ,
- 91 'Eligibility.Gender' ,
- 92 "OfficialTitle" ]
- 93
- 94 self . topics = topics
- 95 self . config = config
- 96 self . query_type = self . config . query_type
- 97
- 98 self . embed_mappings = [ 'BriefTitle_Embedding' ,
- 99 'BriefSummary_Embedding' ,
-100 'DetailedDescription_Embedding' ]
-101
-102 self . id_mapping = "docid"
-103
-104 self . query_funcs = {
-105 "query" : self . generate_query ,
-106 "embedding" : self . generate_query_embedding ,
-107 }
-
-
-
- A generic elasticsearch query. Contains methods for NIR-style (embedding) queries and normal BM25 queries.
-Requires topics, configs to be included
-
-
-
-
-
-
-
- TrecClincialElasticsearchQuery (topics , config , * args , ** kwargs )
-
- View Source
-
-
-
-
75 def __init__ ( self , topics , config , * args , ** kwargs ):
- 76 super () . __init__ ( topics , config , * args , ** kwargs )
- 77
- 78 #self.mappings = ['BriefTitle_Text',
- 79 # 'BriefSummary_Text',
- 80 # 'DetailedDescription_Text']
- 81
- 82 self . mappings = [
- 83 "BriefSummary_Text" ,
- 84 "BriefTitle_Text" ,
- 85 'DetailedDescription_Text' ,
- 86 'Eligibility.Criteria.Textblock'
- 87 'Eligibility.StudyPop.Textblock' ,
- 88 'ConditionBrowse.MeshTerm' ,
- 89 'InterventionBrowse.MeshTerm' ,
- 90 'Condition' ,
- 91 'Eligibility.Gender' ,
- 92 "OfficialTitle" ]
- 93
- 94 self . topics = topics
- 95 self . config = config
- 96 self . query_type = self . config . query_type
- 97
- 98 self . embed_mappings = [ 'BriefTitle_Embedding' ,
- 99 'BriefSummary_Embedding' ,
-100 'DetailedDescription_Embedding' ]
-101
-102 self . id_mapping = "docid"
-103
-104 self . query_funcs = {
-105 "query" : self . generate_query ,
-106 "embedding" : self . generate_query_embedding ,
-107 }
-
-
-
-
-
-
-
-
Inherited Members
-
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/docs/debeir/data_sets/trec_covid.html b/docs/debeir/data_sets/trec_covid.html
deleted file mode 100644
index d590bfe..0000000
--- a/docs/debeir/data_sets/trec_covid.html
+++ /dev/null
@@ -1,453 +0,0 @@
-
-
-
-
-
-
- debeir.data_sets.trec_covid API documentation
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- View Source
-
- 1 from typing import Dict
- 2
- 3 from debeir.interfaces.query import GenericElasticsearchQuery
- 4 from debeir.interfaces.parser import XMLParser
- 5
- 6
- 7 class TrecCovidParser ( XMLParser ):
- 8 parse_fields = [ "query" , "question" , "narrative" ]
- 9 topic_field_name = "topic"
-10 id_field = "number"
-11
-12 @classmethod
-13 def get_topics ( cls , xmlfile ) -> Dict [ int , Dict [ str , str ]]:
-14 return super () . get_topics ( xmlfile )
-15
-16
-17 class TrecElasticsearchQuery ( GenericElasticsearchQuery ):
-18 def __init__ ( self , topics , config , * args , ** kwargs ):
-19 super () . __init__ ( topics , config , * args , ** kwargs )
-20
-21 self . mappings = [ "title" , "abstract" , "fulltext" ]
-22
-23 self . topics = topics
-24 self . config = config
-25 self . query_type = self . config . query_type
-26
-27 self . embed_mappings = [
-28 "title_embedding" ,
-29 "abstract_embedding" ,
-30 "fulltext_embedding" ,
-31 ]
-32
-33 self . id_mapping = "id"
-34
-35 self . query_funcs = {
-36 "query" : self . generate_query ,
-37 "embedding" : self . generate_query_embedding ,
-38 }
-
-
-
-
-
-
-
-
- 8 class TrecCovidParser ( XMLParser ):
- 9 parse_fields = [ "query" , "question" , "narrative" ]
-10 topic_field_name = "topic"
-11 id_field = "number"
-12
-13 @classmethod
-14 def get_topics ( cls , xmlfile ) -> Dict [ int , Dict [ str , str ]]:
-15 return super () . get_topics ( xmlfile )
-
-
-
- Load topics from an XML file
-
-
-
-
-
-
-
@classmethod
-
-
def
-
get_topics (cls , xmlfile ) -> Dict [ int , Dict [ str , str ]] :
-
-
View Source
-
-
-
-
13 @classmethod
-14 def get_topics ( cls , xmlfile ) -> Dict [ int , Dict [ str , str ]]:
-15 return super () . get_topics ( xmlfile )
-
-
-
-
Instance method for getting topics, forwards instance self parameters to the _get_topics class method.
-
-
-
-
-
-
Inherited Members
-
-
-
-
-
-
-
-
-
-
- 18 class TrecElasticsearchQuery ( GenericElasticsearchQuery ):
-19 def __init__ ( self , topics , config , * args , ** kwargs ):
-20 super () . __init__ ( topics , config , * args , ** kwargs )
-21
-22 self . mappings = [ "title" , "abstract" , "fulltext" ]
-23
-24 self . topics = topics
-25 self . config = config
-26 self . query_type = self . config . query_type
-27
-28 self . embed_mappings = [
-29 "title_embedding" ,
-30 "abstract_embedding" ,
-31 "fulltext_embedding" ,
-32 ]
-33
-34 self . id_mapping = "id"
-35
-36 self . query_funcs = {
-37 "query" : self . generate_query ,
-38 "embedding" : self . generate_query_embedding ,
-39 }
-
-
-
- A generic elasticsearch query. Contains methods for NIR-style (embedding) queries and normal BM25 queries.
-Requires topics, configs to be included
-
-
-
-
-
-
-
- TrecElasticsearchQuery (topics , config , * args , ** kwargs )
-
- View Source
-
-
-
-
19 def __init__ ( self , topics , config , * args , ** kwargs ):
-20 super () . __init__ ( topics , config , * args , ** kwargs )
-21
-22 self . mappings = [ "title" , "abstract" , "fulltext" ]
-23
-24 self . topics = topics
-25 self . config = config
-26 self . query_type = self . config . query_type
-27
-28 self . embed_mappings = [
-29 "title_embedding" ,
-30 "abstract_embedding" ,
-31 "fulltext_embedding" ,
-32 ]
-33
-34 self . id_mapping = "id"
-35
-36 self . query_funcs = {
-37 "query" : self . generate_query ,
-38 "embedding" : self . generate_query_embedding ,
-39 }
-
-
-
-
-
-
-
-
Inherited Members
-
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/docs/debeir/data_sets/types.html b/docs/debeir/data_sets/types.html
deleted file mode 100644
index 2f5ec9f..0000000
--- a/docs/debeir/data_sets/types.html
+++ /dev/null
@@ -1,731 +0,0 @@
-
-
-
-
-
-
- debeir.data_sets.types API documentation
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- View Source
-
- 1 import string
- 2 from collections import defaultdict
- 3 from enum import Enum
- 4 from typing import List , Union
- 5
- 6
- 7 class InputExample :
- 8 """
- 9 Copied from Sentence Transformer Library
-10 Structure for one input example with texts, the label and a unique id
-11 """
-12
-13 def __init__ ( self , guid : str = '' , texts : List [ str ] = None , label : Union [ int , float ] = 0 ):
-14 """
-15 Creates one InputExample with the given texts, guid and label
-16
-17 :param guid
-18 id for the example
-19 :param texts
-20 the texts for the example. Note, str.strip() is called on the texts
-21 :param label
-22 the label for the example
-23 """
-24 self . guid = guid
-25 self . texts = [ text . strip () for text in texts ]
-26 self . label = label
-27
-28 def __str__ ( self ):
-29 return "<InputExample> label: {} , texts: {} " . format ( str ( self . label ), "; " . join ( self . texts ))
-30
-31 def get_label ( self ):
-32 return self . label
-33
-34 #def __getattr__(self, key):
-35 # if key == "label":
-36 # return self.get_label()
-37
-38 # if key == "texts":
-39 # return self.texts
-40
-41 # if key in ["guid", "id"]:
-42 # return self.guid
-43
-44 # raise KeyError()
-45
-46 @classmethod
-47 def to_dict ( cls , data : List [ 'InputExample' ]):
-48 text_len = len ( data [ 0 ] . texts )
-49 processed_data = defaultdict ( lambda : [])
-50
-51 for datum in data :
-52 # string.ascii_lowercase
-53
-54 processed_data [ "id" ] . append ( datum . guid )
-55 processed_data [ "label" ] . append ( datum . get_label ())
-56
-57 for i in range ( text_len ):
-58 letter = string . ascii_lowercase [ i ] # abcdefghi
-59 # processed_data[text_a] = ...
-60 processed_data [ f "text_ { letter } " ] . append ( datum . texts [ i ])
-61
-62 return processed_data
-63
-64 @classmethod
-65 def from_parser_output ( cls , data ):
-66 pass
-67
-68
-69 class RelevanceExample ( InputExample ):
-70 """
-71 Converts Relevance Labels to 0 - 1
-72 """
-73
-74 def __init__ ( self , max_score = 2 , * args , ** kwargs ):
-75 super () . __init__ ( * args , ** kwargs )
-76 self . max_score = max_score
-77
-78 def get_label ( self ):
-79 return self . relevance ()
-80
-81 def relevance ( self ):
-82 """
-83 :return:
-84 Returns a normalised score for relevance between 0 - 1
-85 """
-86 return self . label / self . max_score
-87
-88
-89 class DatasetTypes ( Enum ):
-90 """
-91 A collection of common dataset types that is usable in the library.
-92 """
-93 List : "List"
-94 ListInputExample : "ListInputExample"
-95 ListDict : "ListDict"
-96 HuggingfaceDataset : "HuggingfaceDataset"
-
-
-
-
-
-
-
-
-
-
class
-
RelevanceExample (InputExample ):
-
- View Source
-
-
-
- 70 class RelevanceExample ( InputExample ):
-71 """
-72 Converts Relevance Labels to 0 - 1
-73 """
-74
-75 def __init__ ( self , max_score = 2 , * args , ** kwargs ):
-76 super () . __init__ ( * args , ** kwargs )
-77 self . max_score = max_score
-78
-79 def get_label ( self ):
-80 return self . relevance ()
-81
-82 def relevance ( self ):
-83 """
-84 :return:
-85 Returns a normalised score for relevance between 0 - 1
-86 """
-87 return self . label / self . max_score
-
-
-
- Converts Relevance Labels to 0 - 1
-
-
-
-
-
-
-
- RelevanceExample (max_score = 2 , * args , ** kwargs )
-
- View Source
-
-
-
-
75 def __init__ ( self , max_score = 2 , * args , ** kwargs ):
-76 super () . __init__ ( * args , ** kwargs )
-77 self . max_score = max_score
-
-
-
-
Creates one InputExample with the given texts, guid and label
-
-
:param guid
- id for the example
-:param texts
- the texts for the example. Note, str.strip() is called on the texts
-:param label
- the label for the example
-
-
-
-
-
-
-
-
- def
- get_label (self ):
-
- View Source
-
-
-
-
79 def get_label ( self ):
-80 return self . relevance ()
-
-
-
-
-
-
-
-
-
-
- def
- relevance (self ):
-
- View Source
-
-
-
-
82 def relevance ( self ):
-83 """
-84 :return:
-85 Returns a normalised score for relevance between 0 - 1
-86 """
-87 return self . label / self . max_score
-
-
-
-
Returns
-
-
-Returns a normalised score for relevance between 0 - 1
-
-
-
-
-
-
-
-
Inherited Members
-
-
-
-
-
-
-
-
-
- class
- DatasetTypes (enum.Enum ):
-
- View Source
-
-
-
- 90 class DatasetTypes ( Enum ):
-91 """
-92 A collection of common dataset types that is usable in the library.
-93 """
-94 List : "List"
-95 ListInputExample : "ListInputExample"
-96 ListDict : "ListDict"
-97 HuggingfaceDataset : "HuggingfaceDataset"
-
-
-
- A collection of common dataset types that is usable in the library.
-
-
-
-
-
Inherited Members
-
-
enum.Enum
- name
- value
-
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/docs/debeir/data_sets/utils.html b/docs/debeir/data_sets/utils.html
deleted file mode 100644
index 16d0d81..0000000
--- a/docs/debeir/data_sets/utils.html
+++ /dev/null
@@ -1,545 +0,0 @@
-
-
-
-
-
-
- debeir.data_sets.utils API documentation
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- View Source
-
- 1 # TODO: Convert a Parser Return Dict (Dict[int, Dict[str, ...])
- 2
- 3 import datasets
- 4
- 5 from debeir.data_sets.types import InputExample
- 6 from debeir.evaluation.cross_validation import CrossValidator
- 7 from debeir.data_sets.types import DatasetTypes
- 8 from debeir.evaluation.evaluator import Evaluator
- 9
-10
-11 class CrossValidatorDataset :
-12 """
-13 Cross Validator Dataset
-14 """
-15 cross_val_cls : CrossValidator
-16
-17 def __init__ ( self , dataset , cross_validator , n_folds , x_attr = 'text' , y_attr = 'label' ):
-18 self . cross_val_cls = cross_validator
-19 self . dataset = dataset
-20 self . fold = 0
-21 self . n_folds = n_folds
-22 self . x_attr = x_attr
-23 self . y_attr = y_attr
-24 self . folds = []
-25
-26 @classmethod
-27 def prepare_cross_validator ( cls , data , evaluator : Evaluator ,
-28 n_splits : int , x_attr , y_attr , seed = 42 ) -> 'CrossValidatorDataset' :
-29 """
-30 Prepare the cross validator dataset object that will internally produce the folds.
-31
-32 :param data: Dataset to be used. Should be a list of dicts, or list of [x,y] or a Dataset object from data_sets
-33 :param evaluator: Evaluator to use for checking results
-34 :param n_splits: Number of cross validation splits, k-fold (stratified)
-35 :param seed: Seed to use (default 42)
-36 :param y_attr: Label, or idx of the y label
-37 :param x_attr: Label or idx of the x label (not directly used)
-38 """
-39
-40 return cls ( data , CrossValidator ( evaluator , data , x_attr , y_attr ,
-41 n_splits = n_splits , seed = seed ),
-42 x_attr = x_attr , y_attr = y_attr ,
-43 n_folds = n_splits )
-44
-45 def get_fold ( self , idx ) -> datasets . DatasetDict :
-46 """
-47
-48 Get the fold and returns a dataset.DataDict object with
-49 DataDict{'train': ..., 'val': ...}
-50
-51 :param idx:
-52 """
-53
-54 train_idxs , val_idxs = self . cross_val_cls . get_fold ( idx )
-55 dataset_dict = DatasetDict ()
-56
-57 if self . cross_val_cls . dataset_type in [ DatasetTypes . List , DatasetTypes . ListDict ]:
-58 # TODO: figure out how to make this into a huggingface dataset object generically
-59 train_subset = [ self . dataset [ i ] for i in train_idxs ]
-60 val_subset = [ self . dataset [ i ] for i in val_idxs ]
-61 elif self . cross_val_cls . dataset_type == DatasetTypes . ListInputExample :
-62 train_subset = InputExample . to_dict ([ self . dataset [ i ] for i in train_idxs ])
-63 val_subset = InputExample . to_dict ([ self . dataset [ i ] for i in val_idxs ])
-64
-65 dataset_dict [ 'train' ] = datasets . Dataset . from_dict ( train_subset )
-66 dataset_dict [ 'val' ] = datasets . Dataset . from_dict ( val_subset )
-67
-68 elif self . cross_val_cls . dataset_type == DatasetTypes . HuggingfaceDataset :
-69 train_subset = self . dataset . select ( train_idxs )
-70 val_subset = self . dataset . select ( val_idxs )
-71
-72 dataset_dict [ 'train' ] = datasets . Dataset . from_dict ( train_subset )
-73 dataset_dict [ 'val' ] = datasets . Dataset . from_dict ( val_subset )
-74
-75 return dataset_dict
-
-
-
-
-
-
-
-
- class
- CrossValidatorDataset :
-
- View Source
-
-
-
- 12 class CrossValidatorDataset :
-13 """
-14 Cross Validator Dataset
-15 """
-16 cross_val_cls : CrossValidator
-17
-18 def __init__ ( self , dataset , cross_validator , n_folds , x_attr = 'text' , y_attr = 'label' ):
-19 self . cross_val_cls = cross_validator
-20 self . dataset = dataset
-21 self . fold = 0
-22 self . n_folds = n_folds
-23 self . x_attr = x_attr
-24 self . y_attr = y_attr
-25 self . folds = []
-26
-27 @classmethod
-28 def prepare_cross_validator ( cls , data , evaluator : Evaluator ,
-29 n_splits : int , x_attr , y_attr , seed = 42 ) -> 'CrossValidatorDataset' :
-30 """
-31 Prepare the cross validator dataset object that will internally produce the folds.
-32
-33 :param data: Dataset to be used. Should be a list of dicts, or list of [x,y] or a Dataset object from data_sets
-34 :param evaluator: Evaluator to use for checking results
-35 :param n_splits: Number of cross validation splits, k-fold (stratified)
-36 :param seed: Seed to use (default 42)
-37 :param y_attr: Label, or idx of the y label
-38 :param x_attr: Label or idx of the x label (not directly used)
-39 """
-40
-41 return cls ( data , CrossValidator ( evaluator , data , x_attr , y_attr ,
-42 n_splits = n_splits , seed = seed ),
-43 x_attr = x_attr , y_attr = y_attr ,
-44 n_folds = n_splits )
-45
-46 def get_fold ( self , idx ) -> datasets . DatasetDict :
-47 """
-48
-49 Get the fold and returns a dataset.DataDict object with
-50 DataDict{'train': ..., 'val': ...}
-51
-52 :param idx:
-53 """
-54
-55 train_idxs , val_idxs = self . cross_val_cls . get_fold ( idx )
-56 dataset_dict = DatasetDict ()
-57
-58 if self . cross_val_cls . dataset_type in [ DatasetTypes . List , DatasetTypes . ListDict ]:
-59 # TODO: figure out how to make this into a huggingface dataset object generically
-60 train_subset = [ self . dataset [ i ] for i in train_idxs ]
-61 val_subset = [ self . dataset [ i ] for i in val_idxs ]
-62 elif self . cross_val_cls . dataset_type == DatasetTypes . ListInputExample :
-63 train_subset = InputExample . to_dict ([ self . dataset [ i ] for i in train_idxs ])
-64 val_subset = InputExample . to_dict ([ self . dataset [ i ] for i in val_idxs ])
-65
-66 dataset_dict [ 'train' ] = datasets . Dataset . from_dict ( train_subset )
-67 dataset_dict [ 'val' ] = datasets . Dataset . from_dict ( val_subset )
-68
-69 elif self . cross_val_cls . dataset_type == DatasetTypes . HuggingfaceDataset :
-70 train_subset = self . dataset . select ( train_idxs )
-71 val_subset = self . dataset . select ( val_idxs )
-72
-73 dataset_dict [ 'train' ] = datasets . Dataset . from_dict ( train_subset )
-74 dataset_dict [ 'val' ] = datasets . Dataset . from_dict ( val_subset )
-75
-76 return dataset_dict
-
-
-
- Cross Validator Dataset
-
-
-
-
-
-
-
- CrossValidatorDataset (dataset , cross_validator , n_folds , x_attr = 'text' , y_attr = 'label' )
-
- View Source
-
-
-
-
18 def __init__ ( self , dataset , cross_validator , n_folds , x_attr = 'text' , y_attr = 'label' ):
-19 self . cross_val_cls = cross_validator
-20 self . dataset = dataset
-21 self . fold = 0
-22 self . n_folds = n_folds
-23 self . x_attr = x_attr
-24 self . y_attr = y_attr
-25 self . folds = []
-
-
-
-
-
-
-
-
-
-
-
27 @classmethod
-28 def prepare_cross_validator ( cls , data , evaluator : Evaluator ,
-29 n_splits : int , x_attr , y_attr , seed = 42 ) -> 'CrossValidatorDataset' :
-30 """
-31 Prepare the cross validator dataset object that will internally produce the folds.
-32
-33 :param data: Dataset to be used. Should be a list of dicts, or list of [x,y] or a Dataset object from data_sets
-34 :param evaluator: Evaluator to use for checking results
-35 :param n_splits: Number of cross validation splits, k-fold (stratified)
-36 :param seed: Seed to use (default 42)
-37 :param y_attr: Label, or idx of the y label
-38 :param x_attr: Label or idx of the x label (not directly used)
-39 """
-40
-41 return cls ( data , CrossValidator ( evaluator , data , x_attr , y_attr ,
-42 n_splits = n_splits , seed = seed ),
-43 x_attr = x_attr , y_attr = y_attr ,
-44 n_folds = n_splits )
-
-
-
-
Prepare the cross validator dataset object that will internally produce the folds.
-
-
Parameters
-
-
-data : Dataset to be used. Should be a list of dicts, or list of [x,y] or a Dataset object from data_sets
-evaluator : Evaluator to use for checking results
-n_splits : Number of cross validation splits, k-fold (stratified)
-seed : Seed to use (default 42)
-y_attr : Label, or idx of the y label
-x_attr : Label or idx of the x label (not directly used)
-
-
-
-
-
-
-
-
-
- def
- get_fold (self , idx ) -> datasets . dataset_dict . DatasetDict :
-
- View Source
-
-
-
-
46 def get_fold ( self , idx ) -> datasets . DatasetDict :
-47 """
-48
-49 Get the fold and returns a dataset.DataDict object with
-50 DataDict{'train': ..., 'val': ...}
-51
-52 :param idx:
-53 """
-54
-55 train_idxs , val_idxs = self . cross_val_cls . get_fold ( idx )
-56 dataset_dict = DatasetDict ()
-57
-58 if self . cross_val_cls . dataset_type in [ DatasetTypes . List , DatasetTypes . ListDict ]:
-59 # TODO: figure out how to make this into a huggingface dataset object generically
-60 train_subset = [ self . dataset [ i ] for i in train_idxs ]
-61 val_subset = [ self . dataset [ i ] for i in val_idxs ]
-62 elif self . cross_val_cls . dataset_type == DatasetTypes . ListInputExample :
-63 train_subset = InputExample . to_dict ([ self . dataset [ i ] for i in train_idxs ])
-64 val_subset = InputExample . to_dict ([ self . dataset [ i ] for i in val_idxs ])
-65
-66 dataset_dict [ 'train' ] = datasets . Dataset . from_dict ( train_subset )
-67 dataset_dict [ 'val' ] = datasets . Dataset . from_dict ( val_subset )
-68
-69 elif self . cross_val_cls . dataset_type == DatasetTypes . HuggingfaceDataset :
-70 train_subset = self . dataset . select ( train_idxs )
-71 val_subset = self . dataset . select ( val_idxs )
-72
-73 dataset_dict [ 'train' ] = datasets . Dataset . from_dict ( train_subset )
-74 dataset_dict [ 'val' ] = datasets . Dataset . from_dict ( val_subset )
-75
-76 return dataset_dict
-
-
-
-
Get the fold and returns a dataset.DataDict object with
-DataDict{'train': ..., 'val': ...}
-
-
Parameters
-
-
-
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/docs/debeir/datasets.html b/docs/debeir/datasets.html
index 7c0bfb4..d335441 100644
--- a/docs/debeir/datasets.html
+++ b/docs/debeir/datasets.html
@@ -53,7 +53,7 @@ Submodules
- Contains data_sets implemented from nir.interfaces
+
Contains data_sets implemented from nir.core
Parser (For reading data from files into a Dict object)
@@ -69,7 +69,7 @@
View Source
1 """
-2 Contains data_sets implemented from nir.interfaces
+2 Contains data_sets implemented from nir.core
3 1. Parser (For reading data from files into a Dict object)
4 2. Query object (Generating queries)
5 - These query objects can be very lightweight containing only the mappings of the index.
diff --git a/docs/debeir/evaluation/evaluator.html b/docs/debeir/evaluation/evaluator.html
index 849f94c..6187de1 100644
--- a/docs/debeir/evaluation/evaluator.html
+++ b/docs/debeir/evaluation/evaluator.html
@@ -319,7 +319,7 @@ Returns
def
-
average_all_metrics ( self , runs : Dict , logger : < loguru . logger handlers = [( id = 0 , level = 10 , sink =< _io . StringIO object at 0x105cfa710 > )] > ):
+
average_all_metrics ( self , runs : Dict , logger : < loguru . logger handlers = [( id = 0 , level = 10 , sink =< _io . StringIO object at 0x103af2710 > )] > ):
View Source
diff --git a/docs/debeir/interfaces.html b/docs/debeir/interfaces.html
deleted file mode 100644
index b18bcb0..0000000
--- a/docs/debeir/interfaces.html
+++ /dev/null
@@ -1,254 +0,0 @@
-
-
-
-
-
-
-
debeir.interfaces API documentation
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/docs/debeir/interfaces/callbacks.html b/docs/debeir/interfaces/callbacks.html
deleted file mode 100644
index 75e339f..0000000
--- a/docs/debeir/interfaces/callbacks.html
+++ /dev/null
@@ -1,859 +0,0 @@
-
-
-
-
-
-
-
debeir.interfaces.callbacks API documentation
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- Callbacks for before after running.
-E.g. before is for setup
-after is for evaluation/serialization etc
-
-
-
-
- View Source
-
- 1 """
- 2 Callbacks for before after running.
- 3 E.g. before is for setup
- 4 after is for evaluation/serialization etc
- 5 """
- 6
- 7 import abc
- 8 import os
- 9 import tempfile
- 10 import uuid
- 11 import loguru
- 12
- 13 from typing import List
- 14 from debeir.interfaces.pipeline import Pipeline
- 15 from debeir.data_sets.factory import query_factory
- 16 from debeir.evaluation.evaluator import Evaluator
- 17 from debeir.interfaces.config import GenericConfig , NIRConfig
- 18
- 19
- 20 class Callback :
- 21 def __init__ ( self ):
- 22 self . pipeline = None
- 23
- 24 @abc . abstractmethod
- 25 def before ( self , pipeline : Pipeline ):
- 26 pass
- 27
- 28 @abc . abstractmethod
- 29 def after ( self , results : List ):
- 30 pass
- 31
- 32
- 33 class SerializationCallback ( Callback ):
- 34 def __init__ ( self , config : GenericConfig , nir_config : NIRConfig ):
- 35 super () . __init__ ()
- 36 self . config = config
- 37 self . nir_config = nir_config
- 38 self . output_file = None
- 39 self . query_cls = query_factory [ self . config . query_fn ]
- 40
- 41 def before ( self , pipeline : Pipeline ):
- 42 """
- 43 Check if output file exists
- 44
- 45 :return:
- 46 Output file path
- 47 """
- 48
- 49 self . pipeline = Pipeline
- 50
- 51 output_file = self . config . output_file
- 52 output_dir = os . path . join ( self . nir_config . output_directory , self . config . index )
- 53
- 54 if output_file is None :
- 55 os . makedirs ( name = output_dir , exist_ok = True )
- 56 output_file = os . path . join ( output_dir , str ( uuid . uuid4 ()))
- 57
- 58 loguru . logger . info ( f "Output file not specified, writing to: { output_file } " )
- 59
- 60 else :
- 61 output_file = os . path . join ( output_dir , output_file )
- 62
- 63 if os . path . exists ( output_file ):
- 64 if not self . config . overwrite_output_if_exists :
- 65 raise RuntimeError ( "Directory exists and isn't explicitly overwritten "
- 66 "in config with overwrite_output_if_exists=True" )
- 67
- 68 loguru . logger . info ( f "Output file exists: { output_file } . Overwriting..." )
- 69 open ( output_file , "w+" ) . close ()
- 70
- 71 pipeline . output_file = output_file
- 72 self . output_file = output_file
- 73
- 74 def after ( self , results : List ):
- 75 """
- 76 Serialize results to self.output_file in a TREC-style format
- 77 :param topic_num: Topic number to serialize
- 78 :param res: Raw elasticsearch result
- 79 :param run_name: The run name for TREC-style runs (default: NO_RUN_NAME)
- 80 """
- 81
- 82 self . _after ( results ,
- 83 output_file = self . output_file ,
- 84 query_cls = self . query_cls ,
- 85 run_name = self . config . run_name )
- 86
- 87 @classmethod
- 88 def _after ( self , results : List , output_file , query_cls , run_name = None ):
- 89 if run_name is None :
- 90 run_name = "NO_RUN_NAME"
- 91
- 92 with open ( output_file , "a+t" ) as writer :
- 93 for ( topic_num , res ) in results :
- 94 for rank , result in enumerate ( res [ "hits" ][ "hits" ], start = 1 ):
- 95 doc_id = None
- 96
- 97 # if self.return_id_only:
- 98 # doc_id = self.query.get_id_mapping(result["fields"])[0]
- 99 # else:
-100 doc_id = query_cls . get_id_mapping ( result [ "_source" ])
-101
-102 line = f " { topic_num } \t " \
-103 f "Q0 \t " \
-104 f " { doc_id } \t " \
-105 f " { rank } \t " \
-106 f " { result [ '_score' ] } \t " \
-107 f " { run_name } \n "
-108
-109 writer . write ( line )
-110
-111
-112 class EvaluationCallback ( Callback ):
-113 def __init__ ( self , evaluator : Evaluator , config ):
-114 super () . __init__ ()
-115 self . evaluator = evaluator
-116 self . config = config
-117 self . parsed_run = None
-118
-119 def before ( self , pipeline : Pipeline ):
-120 self . pipeline = Pipeline
-121
-122 def after ( self , results : List , id_field = "id" ):
-123 if self . pipeline . output_file is None :
-124 directory_name = tempfile . mkdtemp ()
-125 fn = str ( uuid . uuid4 ())
-126
-127 fp = os . path . join ( directory_name , fn )
-128
-129 query = query_factory [ self . config . query_fn ]
-130 query . id_field = id_field
-131
-132 SerializationCallback . _after ( results ,
-133 output_file = fp ,
-134 query_cls = query ,
-135 run_name = self . config . run_name )
-136
-137 self . pipeline . output_file = fp
-138
-139 parsed_run = self . evaluator . evaluate_runs ( self . pipeline . output_file , disable_cache = True )
-140 self . parsed_run = parsed_run
-141
-142 return self . parsed_run
-
-
-
-
-
-
-
-
- class
- Callback :
-
- View Source
-
-
-
- 21 class Callback :
-22 def __init__ ( self ):
-23 self . pipeline = None
-24
-25 @abc . abstractmethod
-26 def before ( self , pipeline : Pipeline ):
-27 pass
-28
-29 @abc . abstractmethod
-30 def after ( self , results : List ):
-31 pass
-
-
-
-
-
-
-
-
-
- Callback ()
-
- View Source
-
-
-
-
22 def __init__ ( self ):
-23 self . pipeline = None
-
-
-
-
-
-
-
-
-
-
-
25 @abc . abstractmethod
-26 def before ( self , pipeline : Pipeline ):
-27 pass
-
-
-
-
-
-
-
-
-
-
@abc.abstractmethod
-
-
def
-
after (self , results : List ):
-
-
View Source
-
-
-
-
29 @abc . abstractmethod
-30 def after ( self , results : List ):
-31 pass
-
-
-
-
-
-
-
-
-
-
-
-
class
-
SerializationCallback (Callback ):
-
- View Source
-
-
-
- 34 class SerializationCallback ( Callback ):
- 35 def __init__ ( self , config : GenericConfig , nir_config : NIRConfig ):
- 36 super () . __init__ ()
- 37 self . config = config
- 38 self . nir_config = nir_config
- 39 self . output_file = None
- 40 self . query_cls = query_factory [ self . config . query_fn ]
- 41
- 42 def before ( self , pipeline : Pipeline ):
- 43 """
- 44 Check if output file exists
- 45
- 46 :return:
- 47 Output file path
- 48 """
- 49
- 50 self . pipeline = Pipeline
- 51
- 52 output_file = self . config . output_file
- 53 output_dir = os . path . join ( self . nir_config . output_directory , self . config . index )
- 54
- 55 if output_file is None :
- 56 os . makedirs ( name = output_dir , exist_ok = True )
- 57 output_file = os . path . join ( output_dir , str ( uuid . uuid4 ()))
- 58
- 59 loguru . logger . info ( f "Output file not specified, writing to: { output_file } " )
- 60
- 61 else :
- 62 output_file = os . path . join ( output_dir , output_file )
- 63
- 64 if os . path . exists ( output_file ):
- 65 if not self . config . overwrite_output_if_exists :
- 66 raise RuntimeError ( "Directory exists and isn't explicitly overwritten "
- 67 "in config with overwrite_output_if_exists=True" )
- 68
- 69 loguru . logger . info ( f "Output file exists: { output_file } . Overwriting..." )
- 70 open ( output_file , "w+" ) . close ()
- 71
- 72 pipeline . output_file = output_file
- 73 self . output_file = output_file
- 74
- 75 def after ( self , results : List ):
- 76 """
- 77 Serialize results to self.output_file in a TREC-style format
- 78 :param topic_num: Topic number to serialize
- 79 :param res: Raw elasticsearch result
- 80 :param run_name: The run name for TREC-style runs (default: NO_RUN_NAME)
- 81 """
- 82
- 83 self . _after ( results ,
- 84 output_file = self . output_file ,
- 85 query_cls = self . query_cls ,
- 86 run_name = self . config . run_name )
- 87
- 88 @classmethod
- 89 def _after ( self , results : List , output_file , query_cls , run_name = None ):
- 90 if run_name is None :
- 91 run_name = "NO_RUN_NAME"
- 92
- 93 with open ( output_file , "a+t" ) as writer :
- 94 for ( topic_num , res ) in results :
- 95 for rank , result in enumerate ( res [ "hits" ][ "hits" ], start = 1 ):
- 96 doc_id = None
- 97
- 98 # if self.return_id_only:
- 99 # doc_id = self.query.get_id_mapping(result["fields"])[0]
-100 # else:
-101 doc_id = query_cls . get_id_mapping ( result [ "_source" ])
-102
-103 line = f " { topic_num } \t " \
-104 f "Q0 \t " \
-105 f " { doc_id } \t " \
-106 f " { rank } \t " \
-107 f " { result [ '_score' ] } \t " \
-108 f " { run_name } \n "
-109
-110 writer . write ( line )
-
-
-
-
-
-
-
-
-
-
35 def __init__ ( self , config : GenericConfig , nir_config : NIRConfig ):
-36 super () . __init__ ()
-37 self . config = config
-38 self . nir_config = nir_config
-39 self . output_file = None
-40 self . query_cls = query_factory [ self . config . query_fn ]
-
-
-
-
-
-
-
-
-
-
-
42 def before ( self , pipeline : Pipeline ):
-43 """
-44 Check if output file exists
-45
-46 :return:
-47 Output file path
-48 """
-49
-50 self . pipeline = Pipeline
-51
-52 output_file = self . config . output_file
-53 output_dir = os . path . join ( self . nir_config . output_directory , self . config . index )
-54
-55 if output_file is None :
-56 os . makedirs ( name = output_dir , exist_ok = True )
-57 output_file = os . path . join ( output_dir , str ( uuid . uuid4 ()))
-58
-59 loguru . logger . info ( f "Output file not specified, writing to: { output_file } " )
-60
-61 else :
-62 output_file = os . path . join ( output_dir , output_file )
-63
-64 if os . path . exists ( output_file ):
-65 if not self . config . overwrite_output_if_exists :
-66 raise RuntimeError ( "Directory exists and isn't explicitly overwritten "
-67 "in config with overwrite_output_if_exists=True" )
-68
-69 loguru . logger . info ( f "Output file exists: { output_file } . Overwriting..." )
-70 open ( output_file , "w+" ) . close ()
-71
-72 pipeline . output_file = output_file
-73 self . output_file = output_file
-
-
-
-
Check if output file exists
-
-
Returns
-
-
-Output file path
-
-
-
-
-
-
-
-
-
-
- def
- after (self , results : List ):
-
- View Source
-
-
-
-
75 def after ( self , results : List ):
-76 """
-77 Serialize results to self.output_file in a TREC-style format
-78 :param topic_num: Topic number to serialize
-79 :param res: Raw elasticsearch result
-80 :param run_name: The run name for TREC-style runs (default: NO_RUN_NAME)
-81 """
-82
-83 self . _after ( results ,
-84 output_file = self . output_file ,
-85 query_cls = self . query_cls ,
-86 run_name = self . config . run_name )
-
-
-
-
Serialize results to self.output_file in a TREC-style format
-
-
Parameters
-
-
-topic_num : Topic number to serialize
-res : Raw elasticsearch result
-run_name: The run name for TREC-style runs (default : NO_RUN_NAME)
-
-
-
-
-
-
-
-
-
-
-
class
-
EvaluationCallback (Callback ):
-
- View Source
-
-
-
- 113 class EvaluationCallback ( Callback ):
-114 def __init__ ( self , evaluator : Evaluator , config ):
-115 super () . __init__ ()
-116 self . evaluator = evaluator
-117 self . config = config
-118 self . parsed_run = None
-119
-120 def before ( self , pipeline : Pipeline ):
-121 self . pipeline = Pipeline
-122
-123 def after ( self , results : List , id_field = "id" ):
-124 if self . pipeline . output_file is None :
-125 directory_name = tempfile . mkdtemp ()
-126 fn = str ( uuid . uuid4 ())
-127
-128 fp = os . path . join ( directory_name , fn )
-129
-130 query = query_factory [ self . config . query_fn ]
-131 query . id_field = id_field
-132
-133 SerializationCallback . _after ( results ,
-134 output_file = fp ,
-135 query_cls = query ,
-136 run_name = self . config . run_name )
-137
-138 self . pipeline . output_file = fp
-139
-140 parsed_run = self . evaluator . evaluate_runs ( self . pipeline . output_file , disable_cache = True )
-141 self . parsed_run = parsed_run
-142
-143 return self . parsed_run
-
-
-
-
-
-
-
-
-
-
114 def __init__ ( self , evaluator : Evaluator , config ):
-115 super () . __init__ ()
-116 self . evaluator = evaluator
-117 self . config = config
-118 self . parsed_run = None
-
-
-
-
-
-
-
-
-
-
-
120 def before ( self , pipeline : Pipeline ):
-121 self . pipeline = Pipeline
-
-
-
-
-
-
-
-
-
-
- def
- after (self , results : List , id_field = 'id' ):
-
- View Source
-
-
-
-
123 def after ( self , results : List , id_field = "id" ):
-124 if self . pipeline . output_file is None :
-125 directory_name = tempfile . mkdtemp ()
-126 fn = str ( uuid . uuid4 ())
-127
-128 fp = os . path . join ( directory_name , fn )
-129
-130 query = query_factory [ self . config . query_fn ]
-131 query . id_field = id_field
-132
-133 SerializationCallback . _after ( results ,
-134 output_file = fp ,
-135 query_cls = query ,
-136 run_name = self . config . run_name )
-137
-138 self . pipeline . output_file = fp
-139
-140 parsed_run = self . evaluator . evaluate_runs ( self . pipeline . output_file , disable_cache = True )
-141 self . parsed_run = parsed_run
-142
-143 return self . parsed_run
-
-
-
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/docs/debeir/interfaces/config.html b/docs/debeir/interfaces/config.html
deleted file mode 100644
index f79f8c7..0000000
--- a/docs/debeir/interfaces/config.html
+++ /dev/null
@@ -1,1480 +0,0 @@
-
-
-
-
-
-
-
debeir.interfaces.config API documentation
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- View Source
-
- 1 import abc
- 2 import dataclasses
- 3 import os
- 4 from abc import ABC
- 5 from dataclasses import dataclass
- 6 from pathlib import Path
- 7 from typing import List , MutableMapping , Dict , Union
- 8
- 9 import loguru
- 10 import toml
- 11
- 12
- 13 class Config :
- 14 """
- 15 Config Interface with creation class methods
- 16 """
- 17
- 18 def __update__ ( self , ** kwargs ):
- 19 attrs = vars ( self )
- 20 kwargs . update ( attrs )
- 21
- 22 return kwargs
- 23
- 24 @classmethod
- 25 def from_toml ( cls , fp : Union [ str , Path ], field_class , * args , ** kwargs ) -> 'Config' :
- 26 """
- 27 Instantiates a Config object from a toml file
- 28
- 29 :param fp: File path of the Config TOML file
- 30 :param field_class: Class of the Config object to be instantiated
- 31 :param args: Arguments to be passed to Config
- 32 :param kwargs: Keyword arguments to be passed
- 33 :return:
- 34 A instantiated and validated Config object.
- 35 """
- 36 args_dict = toml . load ( fp )
- 37
- 38 return cls . from_args ( args_dict , field_class , * args , ** kwargs )
- 39
- 40 @classmethod
- 41 def from_args ( cls , args_dict : MutableMapping , field_class , * args , ** kwargs ):
- 42 """
- 43 Instantiates a Config object from arguments
- 44
- 45
- 46 :param args_dict:
- 47 :param field_class:
- 48 :param args:
- 49 :param kwargs:
- 50 :return:
- 51 """
- 52 from debeir.rankers.transformer_sent_encoder import Encoder
- 53
- 54 field_names = set ( f . name for f in dataclasses . fields ( field_class ))
- 55 obj = field_class ( ** { k : v for k , v in args_dict . items () if k in field_names })
- 56 if hasattr ( obj , 'encoder_fp' ) and obj . encoder_fp :
- 57 obj . encoder = Encoder ( obj . encoder_fp , obj . encoder_normalize )
- 58
- 59 obj . validate ()
- 60
- 61 return obj
- 62
- 63 @classmethod
- 64 def from_dict ( cls , data_class , ** kwargs ):
- 65 """
- 66 Instantiates a Config object from a dictionary
- 67
- 68 :param data_class:
- 69 :param kwargs:
- 70 :return:
- 71 """
- 72 from debeir.rankers.transformer_sent_encoder import Encoder
- 73
- 74 if "encoder_fp" in kwargs and kwargs [ "encoder_fp" ]:
- 75 kwargs [ "encoder" ] = Encoder ( kwargs [ "encoder_fp" ])
- 76
- 77 field_names = set ( f . name for f in dataclasses . fields ( data_class ))
- 78 obj = data_class ( ** { k : v for k , v in kwargs . items () if k in field_names })
- 79 obj . validate ( 0 )
- 80
- 81 return obj
- 82
- 83 @abc . abstractmethod
- 84 def validate ( self ):
- 85 """
- 86 Validates if the config is correct.
- 87 Must be implemented by inherited classes.
- 88 """
- 89 pass
- 90
- 91
- 92 @dataclass ( init = True , unsafe_hash = True )
- 93 class GenericConfig ( Config , ABC ):
- 94 """
- 95 Generic NIR Configuration file for which all configs will inherit
- 96 """
- 97 query_type : str
- 98 index : str = None
- 99 encoder_normalize : bool = True
-100 ablations : bool = False
-101 norm_weight : float = None
-102 automatic : bool = None
-103 encoder : object = None
-104 encoder_fp : str = None
-105 query_weights : List [ float ] = None
-106 cosine_weights : List [ float ] = None
-107 evaluate : bool = False
-108 qrels : str = None
-109 config_fn : str = None
-110 query_fn : str = None
-111 parser_fn : str = None
-112 executor_fn : str = None
-113 cosine_ceiling : float = None
-114 topics_path : str = None
-115 return_id_only : bool = False
-116 overwrite_output_if_exists : bool = False
-117 output_file : str = None
-118 run_name : str = None
-119
-120
-121 @classmethod
-122 def from_toml ( cls , fp : Union [ str , Path ], * args , ** kwargs ) -> 'GenericConfig' :
-123 return Config . from_toml ( fp , cls , * args , ** kwargs )
-124
-125
-126 @dataclass ( init = True )
-127 class _NIRMasterConfig ( Config ):
-128 """
-129 Base NIR Master config: nir.toml
-130 """
-131 metrics : Dict
-132 search : Dict
-133 nir : Dict
-134
-135 def get_metrics ( self , key = 'common' , return_as_instance = False ):
-136 metrics = self . metrics [ key ]
-137 if return_as_instance :
-138 return MetricsConfig . from_args ( metrics , MetricsConfig )
-139
-140 return metrics
-141
-142 def get_search_engine_settings ( self , key = 'elasticsearch' , return_as_instance = False ):
-143 engine_settings = self . search [ 'engines' ][ key ]
-144 if return_as_instance :
-145 return ElasticsearchConfig . from_args ( engine_settings , ElasticsearchConfig )
-146
-147 return engine_settings
-148
-149
-150 def get_nir_settings ( self , key = 'default_settings' , return_as_instance = False ):
-151 nir_settings = self . nir [ key ]
-152
-153 if return_as_instance :
-154 return NIRConfig . from_args ( nir_settings , NIRConfig )
-155
-156 return nir_settings
-157
-158 def validate ( self ):
-159 return True
-160
-161
-162 @dataclass ( init = True )
-163 class ElasticsearchConfig ( Config ):
-164 """
-165 Basic Elasticsearch configuration file settings from the master nir.toml file
-166 """
-167 protocol : str
-168 ip : str
-169 port : str
-170 timeout : int
-171
-172 def validate ( self ):
-173 """
-174 Checks if Elasticsearch URL is correct
-175 """
-176 assert self . protocol in [ 'http' , 'https' ]
-177 assert self . port . isdigit ()
-178
-179
-180 @dataclass ( init = True )
-181 class SolrConfig ( ElasticsearchConfig ):
-182 """
-183 Basic Solr configuration file settings from the master nir.toml file
-184 """
-185 pass
-186
-187
-188 @dataclass ( init = True )
-189 class MetricsConfig ( Config ):
-190 """
-191 Basic Metrics configuration file settings from the master nir.toml file
-192 """
-193 metrics : List [ str ]
-194
-195 def validate ( self ):
-196 """
-197 Checks if each Metrics is usable by evaluator classes
-198 """
-199 for metric in self . metrics :
-200 assert "@" in metric
-201
-202 metric , depth = metric . split ( "@" )
-203
-204 assert metric . isalpha ()
-205 assert depth . isdigit ()
-206
-207
-208 @dataclass ( init = True )
-209 class NIRConfig ( Config ):
-210 """
-211 Basic NIR configuration file settings from the master nir.toml file
-212 """
-213 norm_weight : str
-214 evaluate : bool
-215 return_size : int
-216 output_directory : str
-217
-218 def validate ( self ):
-219 return True
-220
-221
-222 def apply_config ( func ):
-223 """
-224 Configuration decorator.
-225
-226 :param func: Decorated function
-227 :return:
-228 """
-229
-230 def use_config ( self , * args , ** kwargs ):
-231 """
-232 Replaces keywords and args passed to the function with ones from self.config.
-233
-234 :param self:
-235 :param args: To be updated
-236 :param kwargs: To be updated
-237 :return:
-238 """
-239 if self . config is not None :
-240 kwargs = self . config . __update__ ( ** kwargs )
-241
-242 return func ( self , * args , ** kwargs )
-243
-244 return use_config
-245
-246
-247 def override_with_toml_config ( func ):
-248 """
-249 Configuration decorator. Overwrite a functions kwargs and args with a specified toml config file.
-250 Pass override_with_config=path/to/config
-251
-252 :param func: Decorated function
-253 :return:
-254 """
-255
-256 def override_with ( override_with_config_ : str = None , * args , ** kwargs ):
-257 """
-258 Replaces keywords and args passed to the function with ones from self.config.
-259
-260 :param override_with_config_: Path to config else None
-261 :param args: To be updated
-262 :param kwargs: To be updated
-263 :return:
-264 """
-265
-266 if f "override_ { func . __name__ } _with_config_" in kwargs :
-267 override_with_config_ = f "override_ { func . __name__ } _with_config_"
-268
-269 if override_with_config_ is not None :
-270 if os . path . exists ( override_with_config_ ):
-271 toml_kwargs = toml . load ( override_with_config_ )
-272 kwargs = kwargs . update ( ** toml_kwargs )
-273
-274 return func ( * args , ** kwargs )
-275
-276 return override_with
-277
-278
-279 def save_kwargs_to_file ( func ):
-280 def save_kwargs ( save_kwargs_to_ : str = None , * args , ** kwargs ):
-281 """
-282 Save kwargs passed to the function output_file = f"{save_kwargs_to_}_{func.__name__}.toml"
-283
-284 :param save_kwargs_to_: Path to save location for config else None. This should be a DIRECTORY.
-285 :param args: To be updated
-286 :param kwargs: To be updated
-287 :return:
-288 """
-289 if save_kwargs_to_ is not None :
-290 os . makedirs ( save_kwargs_to_ , exist_ok = True )
-291
-292 if os . path . exists ( save_kwargs_to_ ):
-293 output_file = f " { save_kwargs_to_ } / { func . __name__ } .toml"
-294 loguru . logger . info ( f "Saving kwargs to { output_file } " )
-295 toml . dump ( kwargs , open ( output_file , "w+" ))
-296
-297 return func ( * args , ** kwargs )
-298
-299 return save_kwargs
-
-
-
-
-
-
-
-
- class
- Config :
-
- View Source
-
-
-
- 14 class Config :
-15 """
-16 Config Interface with creation class methods
-17 """
-18
-19 def __update__ ( self , ** kwargs ):
-20 attrs = vars ( self )
-21 kwargs . update ( attrs )
-22
-23 return kwargs
-24
-25 @classmethod
-26 def from_toml ( cls , fp : Union [ str , Path ], field_class , * args , ** kwargs ) -> 'Config' :
-27 """
-28 Instantiates a Config object from a toml file
-29
-30 :param fp: File path of the Config TOML file
-31 :param field_class: Class of the Config object to be instantiated
-32 :param args: Arguments to be passed to Config
-33 :param kwargs: Keyword arguments to be passed
-34 :return:
-35 A instantiated and validated Config object.
-36 """
-37 args_dict = toml . load ( fp )
-38
-39 return cls . from_args ( args_dict , field_class , * args , ** kwargs )
-40
-41 @classmethod
-42 def from_args ( cls , args_dict : MutableMapping , field_class , * args , ** kwargs ):
-43 """
-44 Instantiates a Config object from arguments
-45
-46
-47 :param args_dict:
-48 :param field_class:
-49 :param args:
-50 :param kwargs:
-51 :return:
-52 """
-53 from debeir.rankers.transformer_sent_encoder import Encoder
-54
-55 field_names = set ( f . name for f in dataclasses . fields ( field_class ))
-56 obj = field_class ( ** { k : v for k , v in args_dict . items () if k in field_names })
-57 if hasattr ( obj , 'encoder_fp' ) and obj . encoder_fp :
-58 obj . encoder = Encoder ( obj . encoder_fp , obj . encoder_normalize )
-59
-60 obj . validate ()
-61
-62 return obj
-63
-64 @classmethod
-65 def from_dict ( cls , data_class , ** kwargs ):
-66 """
-67 Instantiates a Config object from a dictionary
-68
-69 :param data_class:
-70 :param kwargs:
-71 :return:
-72 """
-73 from debeir.rankers.transformer_sent_encoder import Encoder
-74
-75 if "encoder_fp" in kwargs and kwargs [ "encoder_fp" ]:
-76 kwargs [ "encoder" ] = Encoder ( kwargs [ "encoder_fp" ])
-77
-78 field_names = set ( f . name for f in dataclasses . fields ( data_class ))
-79 obj = data_class ( ** { k : v for k , v in kwargs . items () if k in field_names })
-80 obj . validate ( 0 )
-81
-82 return obj
-83
-84 @abc . abstractmethod
-85 def validate ( self ):
-86 """
-87 Validates if the config is correct.
-88 Must be implemented by inherited classes.
-89 """
-90 pass
-
-
-
- Config Interface with creation class methods
-
-
-
-
-
-
- Config ()
-
-
-
-
-
-
-
-
-
-
-
-
@classmethod
-
-
def
-
from_toml ( cls , fp : Union [ str , pathlib . Path ] , field_class , * args , ** kwargs ) -> debeir.interfaces.config.Config :
-
-
View Source
-
-
-
-
25 @classmethod
-26 def from_toml ( cls , fp : Union [ str , Path ], field_class , * args , ** kwargs ) -> 'Config' :
-27 """
-28 Instantiates a Config object from a toml file
-29
-30 :param fp: File path of the Config TOML file
-31 :param field_class: Class of the Config object to be instantiated
-32 :param args: Arguments to be passed to Config
-33 :param kwargs: Keyword arguments to be passed
-34 :return:
-35 A instantiated and validated Config object.
-36 """
-37 args_dict = toml . load ( fp )
-38
-39 return cls . from_args ( args_dict , field_class , * args , ** kwargs )
-
-
-
-
Instantiates a Config object from a toml file
-
-
Parameters
-
-
-fp : File path of the Config TOML file
-field_class : Class of the Config object to be instantiated
-args : Arguments to be passed to Config
-kwargs : Keyword arguments to be passed
-
-
-
Returns
-
-
-A instantiated and validated Config object.
-
-
-
-
-
-
-
-
-
-
@classmethod
-
-
def
-
from_args (cls , args_dict : MutableMapping , field_class , * args , ** kwargs ):
-
-
View Source
-
-
-
-
41 @classmethod
-42 def from_args ( cls , args_dict : MutableMapping , field_class , * args , ** kwargs ):
-43 """
-44 Instantiates a Config object from arguments
-45
-46
-47 :param args_dict:
-48 :param field_class:
-49 :param args:
-50 :param kwargs:
-51 :return:
-52 """
-53 from debeir.rankers.transformer_sent_encoder import Encoder
-54
-55 field_names = set ( f . name for f in dataclasses . fields ( field_class ))
-56 obj = field_class ( ** { k : v for k , v in args_dict . items () if k in field_names })
-57 if hasattr ( obj , 'encoder_fp' ) and obj . encoder_fp :
-58 obj . encoder = Encoder ( obj . encoder_fp , obj . encoder_normalize )
-59
-60 obj . validate ()
-61
-62 return obj
-
-
-
-
Instantiates a Config object from arguments
-
-
Parameters
-
-
-args_dict :
-field_class :
-args :
-kwargs :
-
-
-
Returns
-
-
-
-
-
-
-
-
@classmethod
-
-
def
-
from_dict (cls , data_class , ** kwargs ):
-
-
View Source
-
-
-
-
64 @classmethod
-65 def from_dict ( cls , data_class , ** kwargs ):
-66 """
-67 Instantiates a Config object from a dictionary
-68
-69 :param data_class:
-70 :param kwargs:
-71 :return:
-72 """
-73 from debeir.rankers.transformer_sent_encoder import Encoder
-74
-75 if "encoder_fp" in kwargs and kwargs [ "encoder_fp" ]:
-76 kwargs [ "encoder" ] = Encoder ( kwargs [ "encoder_fp" ])
-77
-78 field_names = set ( f . name for f in dataclasses . fields ( data_class ))
-79 obj = data_class ( ** { k : v for k , v in kwargs . items () if k in field_names })
-80 obj . validate ( 0 )
-81
-82 return obj
-
-
-
-
Instantiates a Config object from a dictionary
-
-
Parameters
-
-
-
-
Returns
-
-
-
-
-
-
-
-
@abc.abstractmethod
-
-
def
-
validate (self ):
-
-
View Source
-
-
-
-
84 @abc . abstractmethod
-85 def validate ( self ):
-86 """
-87 Validates if the config is correct.
-88 Must be implemented by inherited classes.
-89 """
-90 pass
-
-
-
-
Validates if the config is correct.
-Must be implemented by inherited classes.
-
-
-
-
-
-
-
-
-
@dataclass(init=True, unsafe_hash=True)
-
-
class
-
GenericConfig (Config , abc.ABC ):
-
- View Source
-
-
-
- 93 @dataclass ( init = True , unsafe_hash = True )
- 94 class GenericConfig ( Config , ABC ):
- 95 """
- 96 Generic NIR Configuration file for which all configs will inherit
- 97 """
- 98 query_type : str
- 99 index : str = None
-100 encoder_normalize : bool = True
-101 ablations : bool = False
-102 norm_weight : float = None
-103 automatic : bool = None
-104 encoder : object = None
-105 encoder_fp : str = None
-106 query_weights : List [ float ] = None
-107 cosine_weights : List [ float ] = None
-108 evaluate : bool = False
-109 qrels : str = None
-110 config_fn : str = None
-111 query_fn : str = None
-112 parser_fn : str = None
-113 executor_fn : str = None
-114 cosine_ceiling : float = None
-115 topics_path : str = None
-116 return_id_only : bool = False
-117 overwrite_output_if_exists : bool = False
-118 output_file : str = None
-119 run_name : str = None
-120
-121
-122 @classmethod
-123 def from_toml ( cls , fp : Union [ str , Path ], * args , ** kwargs ) -> 'GenericConfig' :
-124 return Config . from_toml ( fp , cls , * args , ** kwargs )
-
-
-
- Generic NIR Configuration file for which all configs will inherit
-
-
-
-
-
-
- GenericConfig ( query_type : str , index : str = None , encoder_normalize : bool = True , ablations : bool = False , norm_weight : float = None , automatic : bool = None , encoder : object = None , encoder_fp : str = None , query_weights : List [ float ] = None , cosine_weights : List [ float ] = None , evaluate : bool = False , qrels : str = None , config_fn : str = None , query_fn : str = None , parser_fn : str = None , executor_fn : str = None , cosine_ceiling : float = None , topics_path : str = None , return_id_only : bool = False , overwrite_output_if_exists : bool = False , output_file : str = None , run_name : str = None )
-
-
-
-
-
-
-
-
-
-
-
-
-
122 @classmethod
-123 def from_toml ( cls , fp : Union [ str , Path ], * args , ** kwargs ) -> 'GenericConfig' :
-124 return Config . from_toml ( fp , cls , * args , ** kwargs )
-
-
-
-
Instantiates a Config object from a toml file
-
-
Parameters
-
-
-fp : File path of the Config TOML file
-field_class : Class of the Config object to be instantiated
-args : Arguments to be passed to Config
-kwargs : Keyword arguments to be passed
-
-
-
Returns
-
-
-A instantiated and validated Config object.
-
-
-
-
-
-
-
-
Inherited Members
-
-
-
-
-
-
-
-
-
@dataclass(init=True)
-
-
class
-
ElasticsearchConfig (Config ):
-
- View Source
-
-
-
- 163 @dataclass ( init = True )
-164 class ElasticsearchConfig ( Config ):
-165 """
-166 Basic Elasticsearch configuration file settings from the master nir.toml file
-167 """
-168 protocol : str
-169 ip : str
-170 port : str
-171 timeout : int
-172
-173 def validate ( self ):
-174 """
-175 Checks if Elasticsearch URL is correct
-176 """
-177 assert self . protocol in [ 'http' , 'https' ]
-178 assert self . port . isdigit ()
-
-
-
- Basic Elasticsearch configuration file settings from the master nir.toml file
-
-
-
-
-
-
- ElasticsearchConfig (protocol : str , ip : str , port : str , timeout : int )
-
-
-
-
-
-
-
-
-
-
-
-
- def
- validate (self ):
-
- View Source
-
-
-
-
173 def validate ( self ):
-174 """
-175 Checks if Elasticsearch URL is correct
-176 """
-177 assert self . protocol in [ 'http' , 'https' ]
-178 assert self . port . isdigit ()
-
-
-
-
Checks if Elasticsearch URL is correct
-
-
-
-
-
-
Inherited Members
-
-
-
-
-
-
-
-
-
- 181 @dataclass ( init = True )
-182 class SolrConfig ( ElasticsearchConfig ):
-183 """
-184 Basic Solr configuration file settings from the master nir.toml file
-185 """
-186 pass
-
-
-
- Basic Solr configuration file settings from the master nir.toml file
-
-
-
-
-
-
- SolrConfig (protocol : str , ip : str , port : str , timeout : int )
-
-
-
-
-
-
-
-
-
-
Inherited Members
-
-
-
-
-
-
-
-
-
-
@dataclass(init=True)
-
-
class
-
MetricsConfig (Config ):
-
- View Source
-
-
-
- 189 @dataclass ( init = True )
-190 class MetricsConfig ( Config ):
-191 """
-192 Basic Metrics configuration file settings from the master nir.toml file
-193 """
-194 metrics : List [ str ]
-195
-196 def validate ( self ):
-197 """
-198 Checks if each Metrics is usable by evaluator classes
-199 """
-200 for metric in self . metrics :
-201 assert "@" in metric
-202
-203 metric , depth = metric . split ( "@" )
-204
-205 assert metric . isalpha ()
-206 assert depth . isdigit ()
-
-
-
- Basic Metrics configuration file settings from the master nir.toml file
-
-
-
-
-
-
- MetricsConfig (metrics : List [ str ] )
-
-
-
-
-
-
-
-
-
-
-
-
- def
- validate (self ):
-
- View Source
-
-
-
-
196 def validate ( self ):
-197 """
-198 Checks if each Metrics is usable by evaluator classes
-199 """
-200 for metric in self . metrics :
-201 assert "@" in metric
-202
-203 metric , depth = metric . split ( "@" )
-204
-205 assert metric . isalpha ()
-206 assert depth . isdigit ()
-
-
-
-
Checks if each Metrics is usable by evaluator classes
-
-
-
-
-
-
Inherited Members
-
-
-
-
-
-
-
-
-
@dataclass(init=True)
-
-
class
-
NIRConfig (Config ):
-
- View Source
-
-
-
- 209 @dataclass ( init = True )
-210 class NIRConfig ( Config ):
-211 """
-212 Basic NIR configuration file settings from the master nir.toml file
-213 """
-214 norm_weight : str
-215 evaluate : bool
-216 return_size : int
-217 output_directory : str
-218
-219 def validate ( self ):
-220 return True
-
-
-
- Basic NIR configuration file settings from the master nir.toml file
-
-
-
-
-
-
- NIRConfig ( norm_weight : str , evaluate : bool , return_size : int , output_directory : str )
-
-
-
-
-
-
-
-
-
-
-
-
- def
- validate (self ):
-
- View Source
-
-
-
-
219 def validate ( self ):
-220 return True
-
-
-
-
Validates if the config is correct.
-Must be implemented by inherited classes.
-
-
-
-
-
-
Inherited Members
-
-
-
-
-
-
-
-
-
- def
- apply_config (func ):
-
- View Source
-
-
-
- 223 def apply_config ( func ):
-224 """
-225 Configuration decorator.
-226
-227 :param func: Decorated function
-228 :return:
-229 """
-230
-231 def use_config ( self , * args , ** kwargs ):
-232 """
-233 Replaces keywords and args passed to the function with ones from self.config.
-234
-235 :param self:
-236 :param args: To be updated
-237 :param kwargs: To be updated
-238 :return:
-239 """
-240 if self . config is not None :
-241 kwargs = self . config . __update__ ( ** kwargs )
-242
-243 return func ( self , * args , ** kwargs )
-244
-245 return use_config
-
-
-
- Configuration decorator.
-
-
Parameters
-
-
-func : Decorated function
-
-
-
Returns
-
-
-
-
-
-
-
-
- def
- override_with_toml_config (func ):
-
- View Source
-
-
-
- 248 def override_with_toml_config ( func ):
-249 """
-250 Configuration decorator. Overwrite a functions kwargs and args with a specified toml config file.
-251 Pass override_with_config=path/to/config
-252
-253 :param func: Decorated function
-254 :return:
-255 """
-256
-257 def override_with ( override_with_config_ : str = None , * args , ** kwargs ):
-258 """
-259 Replaces keywords and args passed to the function with ones from self.config.
-260
-261 :param override_with_config_: Path to config else None
-262 :param args: To be updated
-263 :param kwargs: To be updated
-264 :return:
-265 """
-266
-267 if f "override_ { func . __name__ } _with_config_" in kwargs :
-268 override_with_config_ = f "override_ { func . __name__ } _with_config_"
-269
-270 if override_with_config_ is not None :
-271 if os . path . exists ( override_with_config_ ):
-272 toml_kwargs = toml . load ( override_with_config_ )
-273 kwargs = kwargs . update ( ** toml_kwargs )
-274
-275 return func ( * args , ** kwargs )
-276
-277 return override_with
-
-
-
- Configuration decorator. Overwrite a functions kwargs and args with a specified toml config file.
-Pass override_with_config=path/to/config
-
-
Parameters
-
-
-func : Decorated function
-
-
-
Returns
-
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/docs/debeir/interfaces/converters.html b/docs/debeir/interfaces/converters.html
deleted file mode 100644
index 49d7be1..0000000
--- a/docs/debeir/interfaces/converters.html
+++ /dev/null
@@ -1,443 +0,0 @@
-
-
-
-
-
-
-
debeir.interfaces.converters API documentation
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- class
- ParsedTopicsToDataset :
-
- View Source
-
-
-
- 9 class ParsedTopicsToDataset :
-10 """
-11 Converts a parser's output to a huggingface dataset object.
-12 """
-13 @classmethod
-14 def convert ( cls , parser : Parser , output : Dict [ Union [ str , int ], Dict ]):
-15 """
-16 Flatten a Dict of shape (traditional parser output)
-17 {topic_id: {
-18 "Facet_1": ...
-19 "Facet_2": ...
-20 }
-21 }
-22
-23 ->
-24
-25 To a flattened arrow-like dataset.
-26 {
-27 topic_ids: [],
-28 Facet_1s: [],
-29 Facet_2s: [],
-30 }
-31
-32 :param output: Topics output from the parser object
-33 :return:
-34 """
-35 flattened_topics = defaultdict ( lambda : [])
-36
-37 for topic_id , topic in output . items ():
-38 flattened_topics [ "topic_id" ] . append ( topic_id )
-39
-40 for field in parser . parse_fields :
-41 if field in topic :
-42 flattened_topics [ field ] . append ( topic [ field ])
-43 else :
-44 flattened_topics [ field ] . append ( None )
-45
-46 return datasets . Dataset . from_dict ( flattened_topics )
-
-
-
- Converts a parser's output to a huggingface dataset object.
-
-
-
-
-
-
- ParsedTopicsToDataset ()
-
-
-
-
-
-
-
-
-
-
-
-
-
13 @classmethod
-14 def convert ( cls , parser : Parser , output : Dict [ Union [ str , int ], Dict ]):
-15 """
-16 Flatten a Dict of shape (traditional parser output)
-17 {topic_id: {
-18 "Facet_1": ...
-19 "Facet_2": ...
-20 }
-21 }
-22
-23 ->
-24
-25 To a flattened arrow-like dataset.
-26 {
-27 topic_ids: [],
-28 Facet_1s: [],
-29 Facet_2s: [],
-30 }
-31
-32 :param output: Topics output from the parser object
-33 :return:
-34 """
-35 flattened_topics = defaultdict ( lambda : [])
-36
-37 for topic_id , topic in output . items ():
-38 flattened_topics [ "topic_id" ] . append ( topic_id )
-39
-40 for field in parser . parse_fields :
-41 if field in topic :
-42 flattened_topics [ field ] . append ( topic [ field ])
-43 else :
-44 flattened_topics [ field ] . append ( None )
-45
-46 return datasets . Dataset . from_dict ( flattened_topics )
-
-
-
-
Flatten a Dict of shape (traditional parser output)
-{topic_id: {
- "Facet_1": ...
- "Facet_2": ...
- }
-}
-
-
->
-
-
To a flattened arrow-like dataset.
-{
-topic_ids: [],
-Facet_1s: [],
-Facet_2s: [],
-}
-
-
Parameters
-
-
-output : Topics output from the parser object
-
-
-
Returns
-
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/docs/debeir/interfaces/document.html b/docs/debeir/interfaces/document.html
deleted file mode 100644
index 58f8a85..0000000
--- a/docs/debeir/interfaces/document.html
+++ /dev/null
@@ -1,772 +0,0 @@
-
-
-
-
-
-
-
debeir.interfaces.document API documentation
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- View Source
-
- 1 import abc
- 2 import dataclasses
- 3 from typing import Union , Dict , List
- 4
- 5 from debeir.utils.utils import flatten
- 6
- 7
- 8 @dataclasses . dataclass
- 9 class Document :
- 10 """
- 11 Generic Document class.
- 12 Used as an interface for interacting across multiple indexes with different mappings.
- 13 """
- 14 doc_id : Union [ int , float , str ]
- 15 facets : Dict
- 16 score : Union [ float , int ] = 0.0
- 17
- 18 @classmethod
- 19 @abc . abstractmethod
- 20 def from_results ( cls ) -> List [ 'Document' ]:
- 21 """
- 22 Produces a list of Document objects from raw results returned from the index
- 23 """
- 24 pass
- 25
- 26 def get_document_id ( self ):
- 27 """
- 28 :return:
- 29 self.doc_id
- 30 """
- 31 return self . doc_id
- 32
- 33 def flatten_facets ( self , * args , ** kwargs ):
- 34 """
- 35 Flattens multi-level internal document facets into a single level
- 36 e.g. Doc['Upper']['Lower'] -> Doc['Upper_Lower']
- 37 :param args:
- 38 :param kwargs:
- 39 """
- 40 self . facets = flatten ( self . facets , * args , ** kwargs )
- 41
- 42 @classmethod
- 43 def _get_document_facet ( cls , intermediate_repr , key ):
- 44 return intermediate_repr [ key ]
- 45
- 46 def get_document_facet ( self , key , sep = "_" ):
- 47 """
- 48 Retrieve a document facet
- 49 Works for multidimensional keys or single
- 50 :param key: Facet to retrieve
- 51 :param sep: The seperator for multidimensional key
- 52 :return:
- 53 Returns the document facet given the key (field)
- 54 """
- 55 if sep in key :
- 56 keys = key . split ( sep )
- 57
- 58 intermediate_repr = self . facets
- 59 for key in keys :
- 60 intermediate_repr = self . _get_document_facet ( intermediate_repr , key )
- 61
- 62 return intermediate_repr
- 63
- 64 return self . facets [ key ]
- 65
- 66 def set ( self , doc_id = None , facets = None , score = None , facet = None , facet_value = None ) -> 'Document' :
- 67 """
- 68 Set attributes of the object. Use keyword arguments to do so. Works as a builder class.
- 69 doc.set(doc_id="123").set(facets={"title": "my title"})
- 70 :param doc_id:
- 71 :param facets:
- 72 :param score:
- 73 :param facet:
- 74 :param facet_value:
- 75
- 76 :return:
- 77 Returns document object
- 78 """
- 79 if doc_id is not None :
- 80 self . doc_id = doc_id
- 81
- 82 if facets is not None :
- 83 self . facets = facets
- 84
- 85 if score is not None :
- 86 self . score = score
- 87
- 88 if facet is not None and facet_value is not None :
- 89 self . facets [ facet ] = facet_value
- 90
- 91 return self
- 92
- 93 def _get_trec_format ( self ) -> str :
- 94 """
- 95 Returns TREC format for the document
- 96 :return:
- 97 A trec formatted string
- 98 """
- 99 return f " { self . score } "
-100
-101 @classmethod
-102 def get_trec_format ( cls , ranked_list : List [ 'Document' ], sort = True ):
-103 """
-104 Get the trec format of a list of ranked documents. This function is a generator.
-105
-106 :param ranked_list: A list of Document-type objects
-107 :param sort: Whether to sort the input list in descending order of score.
-108 """
-109
-110 if sort :
-111 ranked_list . sort ( key = lambda doc : doc . score , reverse = True )
-112
-113 for document in ranked_list :
-114 yield document . _get_trec_format ()
-
-
-
-
-
-
-
-
@dataclasses.dataclass
-
-
class
-
Document :
-
-
View Source
-
-
-
- 9 @dataclasses . dataclass
- 10 class Document :
- 11 """
- 12 Generic Document class.
- 13 Used as an interface for interacting across multiple indexes with different mappings.
- 14 """
- 15 doc_id : Union [ int , float , str ]
- 16 facets : Dict
- 17 score : Union [ float , int ] = 0.0
- 18
- 19 @classmethod
- 20 @abc . abstractmethod
- 21 def from_results ( cls ) -> List [ 'Document' ]:
- 22 """
- 23 Produces a list of Document objects from raw results returned from the index
- 24 """
- 25 pass
- 26
- 27 def get_document_id ( self ):
- 28 """
- 29 :return:
- 30 self.doc_id
- 31 """
- 32 return self . doc_id
- 33
- 34 def flatten_facets ( self , * args , ** kwargs ):
- 35 """
- 36 Flattens multi-level internal document facets into a single level
- 37 e.g. Doc['Upper']['Lower'] -> Doc['Upper_Lower']
- 38 :param args:
- 39 :param kwargs:
- 40 """
- 41 self . facets = flatten ( self . facets , * args , ** kwargs )
- 42
- 43 @classmethod
- 44 def _get_document_facet ( cls , intermediate_repr , key ):
- 45 return intermediate_repr [ key ]
- 46
- 47 def get_document_facet ( self , key , sep = "_" ):
- 48 """
- 49 Retrieve a document facet
- 50 Works for multidimensional keys or single
- 51 :param key: Facet to retrieve
- 52 :param sep: The seperator for multidimensional key
- 53 :return:
- 54 Returns the document facet given the key (field)
- 55 """
- 56 if sep in key :
- 57 keys = key . split ( sep )
- 58
- 59 intermediate_repr = self . facets
- 60 for key in keys :
- 61 intermediate_repr = self . _get_document_facet ( intermediate_repr , key )
- 62
- 63 return intermediate_repr
- 64
- 65 return self . facets [ key ]
- 66
- 67 def set ( self , doc_id = None , facets = None , score = None , facet = None , facet_value = None ) -> 'Document' :
- 68 """
- 69 Set attributes of the object. Use keyword arguments to do so. Works as a builder class.
- 70 doc.set(doc_id="123").set(facets={"title": "my title"})
- 71 :param doc_id:
- 72 :param facets:
- 73 :param score:
- 74 :param facet:
- 75 :param facet_value:
- 76
- 77 :return:
- 78 Returns document object
- 79 """
- 80 if doc_id is not None :
- 81 self . doc_id = doc_id
- 82
- 83 if facets is not None :
- 84 self . facets = facets
- 85
- 86 if score is not None :
- 87 self . score = score
- 88
- 89 if facet is not None and facet_value is not None :
- 90 self . facets [ facet ] = facet_value
- 91
- 92 return self
- 93
- 94 def _get_trec_format ( self ) -> str :
- 95 """
- 96 Returns TREC format for the document
- 97 :return:
- 98 A trec formatted string
- 99 """
-100 return f " { self . score } "
-101
-102 @classmethod
-103 def get_trec_format ( cls , ranked_list : List [ 'Document' ], sort = True ):
-104 """
-105 Get the trec format of a list of ranked documents. This function is a generator.
-106
-107 :param ranked_list: A list of Document-type objects
-108 :param sort: Whether to sort the input list in descending order of score.
-109 """
-110
-111 if sort :
-112 ranked_list . sort ( key = lambda doc : doc . score , reverse = True )
-113
-114 for document in ranked_list :
-115 yield document . _get_trec_format ()
-
-
-
- Generic Document class.
-Used as an interface for interacting across multiple indexes with different mappings.
-
-
-
-
-
-
- Document ( doc_id : Union [ int , float , str ] , facets : Dict , score : Union [ float , int ] = 0.0 )
-
-
-
-
-
-
-
-
-
-
-
-
-
19 @classmethod
-20 @abc . abstractmethod
-21 def from_results ( cls ) -> List [ 'Document' ]:
-22 """
-23 Produces a list of Document objects from raw results returned from the index
-24 """
-25 pass
-
-
-
-
Produces a list of Document objects from raw results returned from the index
-
-
-
-
-
-
-
-
- def
- get_document_id (self ):
-
- View Source
-
-
-
-
27 def get_document_id ( self ):
-28 """
-29 :return:
-30 self.doc_id
-31 """
-32 return self . doc_id
-
-
-
-
Returns
-
-
-self.doc_id
-
-
-
-
-
-
-
-
-
-
- def
- flatten_facets (self , * args , ** kwargs ):
-
- View Source
-
-
-
-
34 def flatten_facets ( self , * args , ** kwargs ):
-35 """
-36 Flattens multi-level internal document facets into a single level
-37 e.g. Doc['Upper']['Lower'] -> Doc['Upper_Lower']
-38 :param args:
-39 :param kwargs:
-40 """
-41 self . facets = flatten ( self . facets , * args , ** kwargs )
-
-
-
-
Flattens multi-level internal document facets into a single level
- e.g. Doc['Upper']['Lower'] -> Doc['Upper_Lower']
-
-
Parameters
-
-
-
-
-
-
-
-
-
-
- def
- get_document_facet (self , key , sep = '_' ):
-
- View Source
-
-
-
-
47 def get_document_facet ( self , key , sep = "_" ):
-48 """
-49 Retrieve a document facet
-50 Works for multidimensional keys or single
-51 :param key: Facet to retrieve
-52 :param sep: The seperator for multidimensional key
-53 :return:
-54 Returns the document facet given the key (field)
-55 """
-56 if sep in key :
-57 keys = key . split ( sep )
-58
-59 intermediate_repr = self . facets
-60 for key in keys :
-61 intermediate_repr = self . _get_document_facet ( intermediate_repr , key )
-62
-63 return intermediate_repr
-64
-65 return self . facets [ key ]
-
-
-
-
Retrieve a document facet
-Works for multidimensional keys or single
-
-
Parameters
-
-
-key : Facet to retrieve
-sep : The seperator for multidimensional key
-
-
-
Returns
-
-
-Returns the document facet given the key (field)
-
-
-
-
-
-
-
-
-
-
-
67 def set ( self , doc_id = None , facets = None , score = None , facet = None , facet_value = None ) -> 'Document' :
-68 """
-69 Set attributes of the object. Use keyword arguments to do so. Works as a builder class.
-70 doc.set(doc_id="123").set(facets={"title": "my title"})
-71 :param doc_id:
-72 :param facets:
-73 :param score:
-74 :param facet:
-75 :param facet_value:
-76
-77 :return:
-78 Returns document object
-79 """
-80 if doc_id is not None :
-81 self . doc_id = doc_id
-82
-83 if facets is not None :
-84 self . facets = facets
-85
-86 if score is not None :
-87 self . score = score
-88
-89 if facet is not None and facet_value is not None :
-90 self . facets [ facet ] = facet_value
-91
-92 return self
-
-
-
-
Set attributes of the object. Use keyword arguments to do so. Works as a builder class.
-doc.set(doc_id="123").set(facets={"title": "my title"})
-
-
Parameters
-
-
-doc_id :
-facets :
-score :
-facet :
-facet_value :
-
-
-
Returns
-
-
-Returns document object
-
-
-
-
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/docs/debeir/interfaces/executor.html b/docs/debeir/interfaces/executor.html
deleted file mode 100644
index a72992a..0000000
--- a/docs/debeir/interfaces/executor.html
+++ /dev/null
@@ -1,943 +0,0 @@
-
-
-
-
-
-
-
debeir.interfaces.executor API documentation
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- View Source
-
- 1 from typing import Dict , Union , Optional
- 2
- 3 import loguru
- 4 from elasticsearch import AsyncElasticsearch as Elasticsearch
- 5
- 6 from debeir.interfaces.query import GenericElasticsearchQuery
- 7 from debeir.engines.elasticsearch.executor import ElasticsearchExecutor
- 8 from debeir.interfaces.config import NIRConfig , GenericConfig
- 9 from debeir.rankers.transformer_sent_encoder import Encoder
- 10 from debeir.utils.scaler import unpack_elasticsearch_scores
- 11
- 12
- 13 class GenericElasticsearchExecutor ( ElasticsearchExecutor ):
- 14 """
- 15 Generic Executor class for Elasticsearch
- 16 """
- 17 query : GenericElasticsearchQuery
- 18
- 19 def __init__ (
- 20 self ,
- 21 topics : Dict [ Union [ str , int ], Dict [ str , str ]],
- 22 client : Elasticsearch ,
- 23 index_name : str ,
- 24 output_file : str ,
- 25 query : GenericElasticsearchQuery ,
- 26 encoder : Optional [ Encoder ] = None ,
- 27 config = None ,
- 28 * args ,
- 29 ** kwargs ,
- 30 ):
- 31 super () . __init__ (
- 32 topics ,
- 33 client ,
- 34 index_name ,
- 35 output_file ,
- 36 query ,
- 37 encoder ,
- 38 config = config ,
- 39 * args ,
- 40 ** kwargs ,
- 41 )
- 42
- 43 self . query_fns = {
- 44 "query" : self . generate_query ,
- 45 "embedding" : self . generate_embedding_query ,
- 46 }
- 47
- 48 def generate_query ( self , topic_num , best_fields = True , ** kwargs ):
- 49 """
- 50 Generates a standard BM25 query given the topic number
- 51
- 52 :param topic_num: Query topic number to generate
- 53 :param best_fields: Whether to use a curated list of fields
- 54 :param kwargs:
- 55 :return:
- 56 """
- 57 return self . query . generate_query ( topic_num , ** kwargs )
- 58
- 59 #def generate_query_ablation(self, topic_num, **kwargs):
- 60 # return self.query.generate_query_ablation(topic_num)
- 61
- 62 def generate_embedding_query (
- 63 self ,
- 64 topic_num ,
- 65 cosine_weights = None ,
- 66 query_weights = None ,
- 67 norm_weight = 2.15 ,
- 68 automatic_scores = None ,
- 69 ** kwargs ,
- 70 ):
- 71 """
- 72 Executes an NIR-style query with combined scoring.
- 73
- 74 :param topic_num:
- 75 :param cosine_weights:
- 76 :param query_weights:
- 77 :param norm_weight:
- 78 :param automatic_scores:
- 79 :param kwargs:
- 80 :return:
- 81 """
- 82 assert self . encoder is not None or self . config . encoder is not None
- 83
- 84 if "encoder" not in kwargs :
- 85 kwargs [ "encoder" ] = self . encoder
- 86
- 87 return self . query . generate_query_embedding (
- 88 topic_num ,
- 89 cosine_weights = cosine_weights ,
- 90 query_weight = query_weights ,
- 91 norm_weight = norm_weight ,
- 92 automatic_scores = automatic_scores ,
- 93 ** kwargs ,
- 94 )
- 95
- 96 #@apply_config
- 97 async def execute_query (
- 98 self , query = None , return_size : int = None , return_id_only : bool = None ,
- 99 topic_num = None , ablation = False , query_type = None ,
-100 ** kwargs
-101 ):
-102 """
-103 Executes a query using the underlying elasticsearch client.
-104
-105 :param query:
-106 :param topic_num:
-107 :param ablation:
-108 :param query_type:
-109 :param return_size:
-110 :param return_id_only:
-111 :param kwargs:
-112 :return:
-113 """
-114
-115 if ablation :
-116 query_type = "ablation"
-117
-118 assert query is not None or topic_num is not None
-119
-120 if query :
-121 if return_id_only :
-122 # query["fields"] = [self.query.id_mapping]
-123 # query["_source"] = False
-124 query [ "_source" ] = [ self . query . id_mapping ]
-125 res = await self . client . search (
-126 index = self . index_name , body = query , size = return_size
-127 )
-128
-129 return [ query , res ]
-130
-131 if topic_num :
-132 loguru . logger . debug ( query_type )
-133 body = self . query_fns [ query_type ]( topic_num = topic_num , ** kwargs )
-134 if return_id_only :
-135 loguru . logger . debug ( "Skip" )
-136 body [ "_source" ] = [ self . query . id_mapping ]
-137
-138 loguru . logger . debug ( body )
-139 res = await self . client . search (
-140 index = self . index_name , body = body , size = return_size
-141 )
-142
-143 return [ topic_num , res ]
-144
-145 async def run_automatic_adjustment ( self ):
-146 """
-147 Get the normalization constant to be used in NIR-style queries for all topics given an initial
-148 run of BM25 results.
-149 """
-150 loguru . logger . info ( "Running automatic BM25 weight adjustment" )
-151
-152 # Backup variables temporarily
-153 #size = self.return_size
-154 #self.return_size = 1
-155 #self.return_id_only = True
-156 #prev_qt = self.config.query_type
-157 #self.config.query_type = "query"
-158
-159 results = await self . run_all_queries ( query_type = "query" ,
-160 return_results = True ,
-161 return_size = 1 ,
-162 return_id_only = True )
-163
-164 results = unpack_elasticsearch_scores ( results )
-165 self . query . set_bm25_scores ( results )
-166
-167 @classmethod
-168 def build_from_config ( cls , topics : Dict , query_obj : GenericElasticsearchQuery , client ,
-169 config : GenericConfig , nir_config : NIRConfig ):
-170 """
-171 Build an query executor engine from a config file.
-172 """
-173
-174 return cls (
-175 topics = topics ,
-176 client = client ,
-177 config = config ,
-178 index_name = config . index ,
-179 output_file = "" ,
-180 return_size = nir_config . return_size ,
-181 query = query_obj
-182 )
-
-
-
-
-
-
-
-
- 14 class GenericElasticsearchExecutor ( ElasticsearchExecutor ):
- 15 """
- 16 Generic Executor class for Elasticsearch
- 17 """
- 18 query : GenericElasticsearchQuery
- 19
- 20 def __init__ (
- 21 self ,
- 22 topics : Dict [ Union [ str , int ], Dict [ str , str ]],
- 23 client : Elasticsearch ,
- 24 index_name : str ,
- 25 output_file : str ,
- 26 query : GenericElasticsearchQuery ,
- 27 encoder : Optional [ Encoder ] = None ,
- 28 config = None ,
- 29 * args ,
- 30 ** kwargs ,
- 31 ):
- 32 super () . __init__ (
- 33 topics ,
- 34 client ,
- 35 index_name ,
- 36 output_file ,
- 37 query ,
- 38 encoder ,
- 39 config = config ,
- 40 * args ,
- 41 ** kwargs ,
- 42 )
- 43
- 44 self . query_fns = {
- 45 "query" : self . generate_query ,
- 46 "embedding" : self . generate_embedding_query ,
- 47 }
- 48
- 49 def generate_query ( self , topic_num , best_fields = True , ** kwargs ):
- 50 """
- 51 Generates a standard BM25 query given the topic number
- 52
- 53 :param topic_num: Query topic number to generate
- 54 :param best_fields: Whether to use a curated list of fields
- 55 :param kwargs:
- 56 :return:
- 57 """
- 58 return self . query . generate_query ( topic_num , ** kwargs )
- 59
- 60 #def generate_query_ablation(self, topic_num, **kwargs):
- 61 # return self.query.generate_query_ablation(topic_num)
- 62
- 63 def generate_embedding_query (
- 64 self ,
- 65 topic_num ,
- 66 cosine_weights = None ,
- 67 query_weights = None ,
- 68 norm_weight = 2.15 ,
- 69 automatic_scores = None ,
- 70 ** kwargs ,
- 71 ):
- 72 """
- 73 Executes an NIR-style query with combined scoring.
- 74
- 75 :param topic_num:
- 76 :param cosine_weights:
- 77 :param query_weights:
- 78 :param norm_weight:
- 79 :param automatic_scores:
- 80 :param kwargs:
- 81 :return:
- 82 """
- 83 assert self . encoder is not None or self . config . encoder is not None
- 84
- 85 if "encoder" not in kwargs :
- 86 kwargs [ "encoder" ] = self . encoder
- 87
- 88 return self . query . generate_query_embedding (
- 89 topic_num ,
- 90 cosine_weights = cosine_weights ,
- 91 query_weight = query_weights ,
- 92 norm_weight = norm_weight ,
- 93 automatic_scores = automatic_scores ,
- 94 ** kwargs ,
- 95 )
- 96
- 97 #@apply_config
- 98 async def execute_query (
- 99 self , query = None , return_size : int = None , return_id_only : bool = None ,
-100 topic_num = None , ablation = False , query_type = None ,
-101 ** kwargs
-102 ):
-103 """
-104 Executes a query using the underlying elasticsearch client.
-105
-106 :param query:
-107 :param topic_num:
-108 :param ablation:
-109 :param query_type:
-110 :param return_size:
-111 :param return_id_only:
-112 :param kwargs:
-113 :return:
-114 """
-115
-116 if ablation :
-117 query_type = "ablation"
-118
-119 assert query is not None or topic_num is not None
-120
-121 if query :
-122 if return_id_only :
-123 # query["fields"] = [self.query.id_mapping]
-124 # query["_source"] = False
-125 query [ "_source" ] = [ self . query . id_mapping ]
-126 res = await self . client . search (
-127 index = self . index_name , body = query , size = return_size
-128 )
-129
-130 return [ query , res ]
-131
-132 if topic_num :
-133 loguru . logger . debug ( query_type )
-134 body = self . query_fns [ query_type ]( topic_num = topic_num , ** kwargs )
-135 if return_id_only :
-136 loguru . logger . debug ( "Skip" )
-137 body [ "_source" ] = [ self . query . id_mapping ]
-138
-139 loguru . logger . debug ( body )
-140 res = await self . client . search (
-141 index = self . index_name , body = body , size = return_size
-142 )
-143
-144 return [ topic_num , res ]
-145
-146 async def run_automatic_adjustment ( self ):
-147 """
-148 Get the normalization constant to be used in NIR-style queries for all topics given an initial
-149 run of BM25 results.
-150 """
-151 loguru . logger . info ( "Running automatic BM25 weight adjustment" )
-152
-153 # Backup variables temporarily
-154 #size = self.return_size
-155 #self.return_size = 1
-156 #self.return_id_only = True
-157 #prev_qt = self.config.query_type
-158 #self.config.query_type = "query"
-159
-160 results = await self . run_all_queries ( query_type = "query" ,
-161 return_results = True ,
-162 return_size = 1 ,
-163 return_id_only = True )
-164
-165 results = unpack_elasticsearch_scores ( results )
-166 self . query . set_bm25_scores ( results )
-167
-168 @classmethod
-169 def build_from_config ( cls , topics : Dict , query_obj : GenericElasticsearchQuery , client ,
-170 config : GenericConfig , nir_config : NIRConfig ):
-171 """
-172 Build an query executor engine from a config file.
-173 """
-174
-175 return cls (
-176 topics = topics ,
-177 client = client ,
-178 config = config ,
-179 index_name = config . index ,
-180 output_file = "" ,
-181 return_size = nir_config . return_size ,
-182 query = query_obj
-183 )
-
-
-
- Generic Executor class for Elasticsearch
-
-
-
-
-
-
-
-
GenericElasticsearchExecutor ( topics : Dict [ Union [ str , int ], Dict [ str , str ]] , client : elasticsearch . AsyncElasticsearch , index_name : str , output_file : str , query : debeir.interfaces.query.GenericElasticsearchQuery , encoder : Optional [ debeir.rankers.transformer_sent_encoder.Encoder ] = None , config = None , * args , ** kwargs )
-
-
View Source
-
-
-
-
20 def __init__ (
-21 self ,
-22 topics : Dict [ Union [ str , int ], Dict [ str , str ]],
-23 client : Elasticsearch ,
-24 index_name : str ,
-25 output_file : str ,
-26 query : GenericElasticsearchQuery ,
-27 encoder : Optional [ Encoder ] = None ,
-28 config = None ,
-29 * args ,
-30 ** kwargs ,
-31 ):
-32 super () . __init__ (
-33 topics ,
-34 client ,
-35 index_name ,
-36 output_file ,
-37 query ,
-38 encoder ,
-39 config = config ,
-40 * args ,
-41 ** kwargs ,
-42 )
-43
-44 self . query_fns = {
-45 "query" : self . generate_query ,
-46 "embedding" : self . generate_embedding_query ,
-47 }
-
-
-
-
-
-
-
-
-
-
- def
- generate_query (self , topic_num , best_fields = True , ** kwargs ):
-
- View Source
-
-
-
-
49 def generate_query ( self , topic_num , best_fields = True , ** kwargs ):
-50 """
-51 Generates a standard BM25 query given the topic number
-52
-53 :param topic_num: Query topic number to generate
-54 :param best_fields: Whether to use a curated list of fields
-55 :param kwargs:
-56 :return:
-57 """
-58 return self . query . generate_query ( topic_num , ** kwargs )
-
-
-
-
Generates a standard BM25 query given the topic number
-
-
Parameters
-
-
-topic_num : Query topic number to generate
-best_fields : Whether to use a curated list of fields
-kwargs :
-
-
-
Returns
-
-
-
-
-
-
-
-
- def
- generate_embedding_query ( self , topic_num , cosine_weights = None , query_weights = None , norm_weight = 2.15 , automatic_scores = None , ** kwargs ):
-
- View Source
-
-
-
-
63 def generate_embedding_query (
-64 self ,
-65 topic_num ,
-66 cosine_weights = None ,
-67 query_weights = None ,
-68 norm_weight = 2.15 ,
-69 automatic_scores = None ,
-70 ** kwargs ,
-71 ):
-72 """
-73 Executes an NIR-style query with combined scoring.
-74
-75 :param topic_num:
-76 :param cosine_weights:
-77 :param query_weights:
-78 :param norm_weight:
-79 :param automatic_scores:
-80 :param kwargs:
-81 :return:
-82 """
-83 assert self . encoder is not None or self . config . encoder is not None
-84
-85 if "encoder" not in kwargs :
-86 kwargs [ "encoder" ] = self . encoder
-87
-88 return self . query . generate_query_embedding (
-89 topic_num ,
-90 cosine_weights = cosine_weights ,
-91 query_weight = query_weights ,
-92 norm_weight = norm_weight ,
-93 automatic_scores = automatic_scores ,
-94 ** kwargs ,
-95 )
-
-
-
-
Executes an NIR-style query with combined scoring.
-
-
Parameters
-
-
-topic_num :
-cosine_weights :
-query_weights :
-norm_weight :
-automatic_scores :
-kwargs :
-
-
-
Returns
-
-
-
-
-
-
-
-
- async def
- execute_query ( self , query = None , return_size : int = None , return_id_only : bool = None , topic_num = None , ablation = False , query_type = None , ** kwargs ):
-
- View Source
-
-
-
-
98 async def execute_query (
- 99 self , query = None , return_size : int = None , return_id_only : bool = None ,
-100 topic_num = None , ablation = False , query_type = None ,
-101 ** kwargs
-102 ):
-103 """
-104 Executes a query using the underlying elasticsearch client.
-105
-106 :param query:
-107 :param topic_num:
-108 :param ablation:
-109 :param query_type:
-110 :param return_size:
-111 :param return_id_only:
-112 :param kwargs:
-113 :return:
-114 """
-115
-116 if ablation :
-117 query_type = "ablation"
-118
-119 assert query is not None or topic_num is not None
-120
-121 if query :
-122 if return_id_only :
-123 # query["fields"] = [self.query.id_mapping]
-124 # query["_source"] = False
-125 query [ "_source" ] = [ self . query . id_mapping ]
-126 res = await self . client . search (
-127 index = self . index_name , body = query , size = return_size
-128 )
-129
-130 return [ query , res ]
-131
-132 if topic_num :
-133 loguru . logger . debug ( query_type )
-134 body = self . query_fns [ query_type ]( topic_num = topic_num , ** kwargs )
-135 if return_id_only :
-136 loguru . logger . debug ( "Skip" )
-137 body [ "_source" ] = [ self . query . id_mapping ]
-138
-139 loguru . logger . debug ( body )
-140 res = await self . client . search (
-141 index = self . index_name , body = body , size = return_size
-142 )
-143
-144 return [ topic_num , res ]
-
-
-
-
Execute a query given parameters
-
-
Parameters
-
-
-
-
-
-
-
-
-
-
- async def
- run_automatic_adjustment (self ):
-
- View Source
-
-
-
-
146 async def run_automatic_adjustment ( self ):
-147 """
-148 Get the normalization constant to be used in NIR-style queries for all topics given an initial
-149 run of BM25 results.
-150 """
-151 loguru . logger . info ( "Running automatic BM25 weight adjustment" )
-152
-153 # Backup variables temporarily
-154 #size = self.return_size
-155 #self.return_size = 1
-156 #self.return_id_only = True
-157 #prev_qt = self.config.query_type
-158 #self.config.query_type = "query"
-159
-160 results = await self . run_all_queries ( query_type = "query" ,
-161 return_results = True ,
-162 return_size = 1 ,
-163 return_id_only = True )
-164
-165 results = unpack_elasticsearch_scores ( results )
-166 self . query . set_bm25_scores ( results )
-
-
-
-
Get the normalization constant to be used in NIR-style queries for all topics given an initial
-run of BM25 results.
-
-
-
-
-
-
-
-
-
168 @classmethod
-169 def build_from_config ( cls , topics : Dict , query_obj : GenericElasticsearchQuery , client ,
-170 config : GenericConfig , nir_config : NIRConfig ):
-171 """
-172 Build an query executor engine from a config file.
-173 """
-174
-175 return cls (
-176 topics = topics ,
-177 client = client ,
-178 config = config ,
-179 index_name = config . index ,
-180 output_file = "" ,
-181 return_size = nir_config . return_size ,
-182 query = query_obj
-183 )
-
-
-
-
Build an query executor engine from a config file.
-
-
-
-
-
-
Inherited Members
-
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/docs/debeir/interfaces/indexer.html b/docs/debeir/interfaces/indexer.html
deleted file mode 100644
index 90e8bc9..0000000
--- a/docs/debeir/interfaces/indexer.html
+++ /dev/null
@@ -1,724 +0,0 @@
-
-
-
-
-
-
-
debeir.interfaces.indexer API documentation
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- View Source
-
- 1 import abc
- 2 import threading
- 3 from queue import Queue
- 4 from typing import List
- 5
- 6 from elasticsearch import Elasticsearch
- 7
- 8 from debeir.rankers.transformer_sent_encoder import Encoder
- 9 from debeir.utils.utils import remove_excess_whitespace
- 10
- 11
- 12 class Indexer :
- 13 def __init__ ( self , client ):
- 14 super () . __init__ ()
- 15 self . client = client
- 16
- 17 @abc . abstractmethod
- 18 def get_field ( self , document , field ):
- 19 pass
- 20
- 21
- 22 class SemanticElasticsearchIndexer ( Indexer , threading . Thread ):
- 23 """
- 24 Create a NIR-style index, with dense field representations with provided sentence encoder
- 25 Assumes you've already indexed to start with.
- 26 """
- 27
- 28 def __init__ ( self , es_client : Elasticsearch , encoder : Encoder , index : str ,
- 29 fields_to_encode : List [ str ], queue : Queue ):
- 30 super () . __init__ ( es_client )
- 31 self . encoder = encoder
- 32 self . index = index
- 33 self . fields = fields_to_encode
- 34 self . q = queue
- 35 self . update_mappings ( self . index , self . fields , self . client )
- 36
- 37 @classmethod
- 38 def update_mappings ( self , index , fields , client : Elasticsearch ):
- 39 mapping = {}
- 40 value = {
- 41 "type" : "dense_vector" ,
- 42 "dims" : 768
- 43 }
- 44
- 45 for field in fields :
- 46 mapping [ field + "_Embedding" ] = value
- 47 mapping [ field + "_Text" ] = { "type" : "text" }
- 48
- 49 client . indices . put_mapping (
- 50 body = {
- 51 "properties" : mapping
- 52 }, index = index )
- 53
- 54 # async def create_index(self, document_itr=None):
- 55 # await self._update_mappings()
- 56
- 57 # if document_itr is None:
- 58 # document_itr = helpers.async_scan(self.es_client, index=self.index)
- 59
- 60 # bar = tqdm(desc="Indexing", total=35_000)
- 61
- 62 # async for document in document_itr:
- 63 # doc = document["_source"]
- 64 # await self.index_document(doc)
- 65
- 66 # bar.update(1)
- 67
- 68 def get_field ( self , document , field ):
- 69 if field not in document :
- 70 return False
- 71
- 72 if "f {field} _Text" in document and document [ "f {field} _Text" ] != 0 :
- 73 return False
- 74
- 75 if 'Textblock' in document [ field ]:
- 76 return remove_excess_whitespace ( document [ field ][ 'Textblock' ])
- 77
- 78 return remove_excess_whitespace ( document [ field ])
- 79
- 80 def index_document ( self , document ):
- 81 update_doc = {}
- 82 doc = document [ "_source" ]
- 83
- 84 for field in self . fields :
- 85 text_field = self . get_field ( doc , field )
- 86
- 87 if text_field :
- 88 embedding = self . encoder . encode ( self . encoder , topic = text_field , disable_cache = True )
- 89 update_doc [ f " { field } _Embedding" ] = embedding
- 90 update_doc [ f " { field } _Text" ] = text_field
- 91
- 92 if update_doc :
- 93 self . client . update ( index = self . index ,
- 94 id = document [ '_id' ],
- 95 doc = update_doc )
- 96
- 97 def run ( self ):
- 98 while not self . q . empty ():
- 99 document = self . q . get ()
-100 self . index_document ( document )
-
-
-
-
-
-
-
-
- class
- Indexer :
-
- View Source
-
-
-
- 13 class Indexer :
-14 def __init__ ( self , client ):
-15 super () . __init__ ()
-16 self . client = client
-17
-18 @abc . abstractmethod
-19 def get_field ( self , document , field ):
-20 pass
-
-
-
-
-
-
-
-
-
- Indexer (client )
-
- View Source
-
-
-
-
14 def __init__ ( self , client ):
-15 super () . __init__ ()
-16 self . client = client
-
-
-
-
-
-
-
-
-
-
@abc.abstractmethod
-
-
def
-
get_field (self , document , field ):
-
-
View Source
-
-
-
-
18 @abc . abstractmethod
-19 def get_field ( self , document , field ):
-20 pass
-
-
-
-
-
-
-
-
-
-
-
-
class
-
SemanticElasticsearchIndexer (Indexer , threading.Thread ):
-
- View Source
-
-
-
- 23 class SemanticElasticsearchIndexer ( Indexer , threading . Thread ):
- 24 """
- 25 Create a NIR-style index, with dense field representations with provided sentence encoder
- 26 Assumes you've already indexed to start with.
- 27 """
- 28
- 29 def __init__ ( self , es_client : Elasticsearch , encoder : Encoder , index : str ,
- 30 fields_to_encode : List [ str ], queue : Queue ):
- 31 super () . __init__ ( es_client )
- 32 self . encoder = encoder
- 33 self . index = index
- 34 self . fields = fields_to_encode
- 35 self . q = queue
- 36 self . update_mappings ( self . index , self . fields , self . client )
- 37
- 38 @classmethod
- 39 def update_mappings ( self , index , fields , client : Elasticsearch ):
- 40 mapping = {}
- 41 value = {
- 42 "type" : "dense_vector" ,
- 43 "dims" : 768
- 44 }
- 45
- 46 for field in fields :
- 47 mapping [ field + "_Embedding" ] = value
- 48 mapping [ field + "_Text" ] = { "type" : "text" }
- 49
- 50 client . indices . put_mapping (
- 51 body = {
- 52 "properties" : mapping
- 53 }, index = index )
- 54
- 55 # async def create_index(self, document_itr=None):
- 56 # await self._update_mappings()
- 57
- 58 # if document_itr is None:
- 59 # document_itr = helpers.async_scan(self.es_client, index=self.index)
- 60
- 61 # bar = tqdm(desc="Indexing", total=35_000)
- 62
- 63 # async for document in document_itr:
- 64 # doc = document["_source"]
- 65 # await self.index_document(doc)
- 66
- 67 # bar.update(1)
- 68
- 69 def get_field ( self , document , field ):
- 70 if field not in document :
- 71 return False
- 72
- 73 if "f {field} _Text" in document and document [ "f {field} _Text" ] != 0 :
- 74 return False
- 75
- 76 if 'Textblock' in document [ field ]:
- 77 return remove_excess_whitespace ( document [ field ][ 'Textblock' ])
- 78
- 79 return remove_excess_whitespace ( document [ field ])
- 80
- 81 def index_document ( self , document ):
- 82 update_doc = {}
- 83 doc = document [ "_source" ]
- 84
- 85 for field in self . fields :
- 86 text_field = self . get_field ( doc , field )
- 87
- 88 if text_field :
- 89 embedding = self . encoder . encode ( self . encoder , topic = text_field , disable_cache = True )
- 90 update_doc [ f " { field } _Embedding" ] = embedding
- 91 update_doc [ f " { field } _Text" ] = text_field
- 92
- 93 if update_doc :
- 94 self . client . update ( index = self . index ,
- 95 id = document [ '_id' ],
- 96 doc = update_doc )
- 97
- 98 def run ( self ):
- 99 while not self . q . empty ():
-100 document = self . q . get ()
-101 self . index_document ( document )
-
-
-
- Create a NIR-style index, with dense field representations with provided sentence encoder
-Assumes you've already indexed to start with.
-
-
-
-
-
-
-
-
29 def __init__ ( self , es_client : Elasticsearch , encoder : Encoder , index : str ,
-30 fields_to_encode : List [ str ], queue : Queue ):
-31 super () . __init__ ( es_client )
-32 self . encoder = encoder
-33 self . index = index
-34 self . fields = fields_to_encode
-35 self . q = queue
-36 self . update_mappings ( self . index , self . fields , self . client )
-
-
-
-
This constructor should always be called with keyword arguments. Arguments are:
-
-
group should be None; reserved for future extension when a ThreadGroup
-class is implemented.
-
-
target is the callable object to be invoked by the run()
-method. Defaults to None, meaning nothing is called.
-
-
name is the thread name. By default, a unique name is constructed of
-the form "Thread-N" where N is a small decimal number.
-
-
args is the argument tuple for the target invocation. Defaults to ().
-
-
kwargs is a dictionary of keyword arguments for the target
-invocation. Defaults to {}.
-
-
If a subclass overrides the constructor, it must make sure to invoke
-the base class constructor (Thread.__init__()) before doing anything
-else to the thread.
-
-
-
-
-
-
-
-
@classmethod
-
-
def
-
update_mappings (self , index , fields , client : elasticsearch . Elasticsearch ):
-
-
View Source
-
-
-
-
38 @classmethod
-39 def update_mappings ( self , index , fields , client : Elasticsearch ):
-40 mapping = {}
-41 value = {
-42 "type" : "dense_vector" ,
-43 "dims" : 768
-44 }
-45
-46 for field in fields :
-47 mapping [ field + "_Embedding" ] = value
-48 mapping [ field + "_Text" ] = { "type" : "text" }
-49
-50 client . indices . put_mapping (
-51 body = {
-52 "properties" : mapping
-53 }, index = index )
-
-
-
-
-
-
-
-
-
-
- def
- get_field (self , document , field ):
-
- View Source
-
-
-
-
69 def get_field ( self , document , field ):
-70 if field not in document :
-71 return False
-72
-73 if "f {field} _Text" in document and document [ "f {field} _Text" ] != 0 :
-74 return False
-75
-76 if 'Textblock' in document [ field ]:
-77 return remove_excess_whitespace ( document [ field ][ 'Textblock' ])
-78
-79 return remove_excess_whitespace ( document [ field ])
-
-
-
-
-
-
-
-
-
-
- def
- index_document (self , document ):
-
- View Source
-
-
-
-
81 def index_document ( self , document ):
-82 update_doc = {}
-83 doc = document [ "_source" ]
-84
-85 for field in self . fields :
-86 text_field = self . get_field ( doc , field )
-87
-88 if text_field :
-89 embedding = self . encoder . encode ( self . encoder , topic = text_field , disable_cache = True )
-90 update_doc [ f " { field } _Embedding" ] = embedding
-91 update_doc [ f " { field } _Text" ] = text_field
-92
-93 if update_doc :
-94 self . client . update ( index = self . index ,
-95 id = document [ '_id' ],
-96 doc = update_doc )
-
-
-
-
-
-
-
-
-
-
- def
- run (self ):
-
- View Source
-
-
-
-
98 def run ( self ):
- 99 while not self . q . empty ():
-100 document = self . q . get ()
-101 self . index_document ( document )
-
-
-
-
Method representing the thread's activity.
-
-
You may override this method in a subclass. The standard run() method
-invokes the callable object passed to the object's constructor as the
-target argument, if any, with sequential and keyword arguments taken
-from the args and kwargs arguments, respectively.
-
-
-
-
-
-
Inherited Members
-
-
threading.Thread
- start
- join
- name
- ident
- is_alive
- daemon
- isDaemon
- setDaemon
- getName
- setName
- native_id
-
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/docs/debeir/interfaces/parser.html b/docs/debeir/interfaces/parser.html
deleted file mode 100644
index 6065e63..0000000
--- a/docs/debeir/interfaces/parser.html
+++ /dev/null
@@ -1,1079 +0,0 @@
-
-
-
-
-
-
-
debeir.interfaces.parser API documentation
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- View Source
-
- 1 import abc
- 2 import dataclasses
- 3 from collections import defaultdict
- 4 from dataclasses import dataclass
- 5 import csv
- 6 from typing import Dict , List , Optional
- 7 from xml.etree import ElementTree as ET
- 8 import dill
- 9 import json
- 10
- 11 import loguru
- 12 import pandas as pd
- 13
- 14
- 15 # TODO: Parse fields can come from a config or ID_fields
- 16 # TODO: move _get_topics to private cls method with arguments, and expose get_topics as an instance method.
- 17
- 18
- 19 @dataclass ( init = False )
- 20 class Parser :
- 21 """
- 22 Parser interface
- 23 """
- 24
- 25 parse_fields : List [ str ]
- 26
- 27 @classmethod
- 28 def normalize ( cls , input_dict ) -> Dict :
- 29 """
- 30 Flatten the dictionary, i.e. from Dict[int, Dict] -> Dict[str, str_or_int]
- 31
- 32 :param input_dict:
- 33 :return:
- 34 """
- 35 return pd . io . json . json_normalize ( input_dict ,
- 36 sep = "." ) . to_dict ( orient = 'records' )[ 0 ]
- 37
- 38 def get_topics ( self , path , * args , ** kwargs ):
- 39 """
- 40 Instance method for getting topics, forwards instance self parameters to the _get_topics class method.
- 41 """
- 42
- 43 self_kwargs = vars ( self )
- 44 kwargs . update ( self_kwargs )
- 45
- 46 return self . _get_topics ( path , * args , ** kwargs )
- 47
- 48 @classmethod
- 49 @abc . abstractmethod
- 50 def _get_topics ( cls , path , * args , ** kwargs ) -> Dict [ int , Dict [ str , str ]]:
- 51 raise NotImplementedError
- 52
- 53
- 54 @dataclasses . dataclass ( init = True )
- 55 class PickleParser ( Parser ):
- 56 """
- 57 Load topics from a pickle file
- 58 """
- 59
- 60 @classmethod
- 61 def _get_topics ( cls , path , * args , ** kwargs ) -> Dict [ int , Dict [ str , str ]]:
- 62 return dill . load ( path )
- 63
- 64
- 65 @dataclasses . dataclass ( init = True )
- 66 class XMLParser ( Parser ):
- 67 """
- 68 Load topics from an XML file
- 69 """
- 70 topic_field_name : str
- 71 id_field : str
- 72 parse_fields : List [ str ]
- 73
- 74 @classmethod
- 75 def _recurse_to_child_node ( cls , node : ET . Element , track : List ):
- 76 """
- 77 Helper method to get all children nodes for text extraction in an xml.
- 78
- 79 :param node: Current node
- 80 :param track: List to track nodes
- 81 :return:
- 82 """
- 83 if len ( node . getchildren ()) > 0 :
- 84 for child in node . getchildren ():
- 85 track . append ( cls . _recurse_to_child_node ( child , track ))
- 86
- 87 return node
- 88
- 89 @classmethod
- 90 def unwrap ( cls , doc_dict , key ):
- 91 """
- 92 Converts defaultdict to dict and list of size 1 to just the element
- 93
- 94 :param doc_dict:
- 95 :param key:
- 96 """
- 97 if isinstance ( doc_dict [ key ], defaultdict ):
- 98 doc_dict [ key ] = dict ( doc_dict [ key ])
- 99
-100 for e_key in doc_dict [ key ]:
-101 cls . unwrap ( doc_dict [ key ], e_key )
-102
-103 if isinstance ( doc_dict [ key ], list ):
-104 if len ( doc_dict [ key ]) == 1 :
-105 doc_dict [ key ] = doc_dict [ key ][ 0 ]
-106
-107 def _get_topics ( self , path , * args , ** kwargs ) -> Dict [ int , Dict [ str , str ]]:
-108 all_topics = ET . parse ( path ) . getroot ()
-109 qtopics = {}
-110
-111 for topic in all_topics . findall ( self . topic_field_name ):
-112 _id = topic . attrib [ self . id_field ]
-113 if _id . isnumeric ():
-114 _id = int ( _id )
-115
-116 if self . parse_fields :
-117 temp = {}
-118 for field in self . parse_fields :
-119 try :
-120 temp [ field ] = topic . find ( field ) . text . strip ()
-121 except :
-122 continue
-123
-124 qtopics [ _id ] = temp
-125 else :
-126 # The topic contains the text
-127 qtopics [ _id ] = { "query" : topic . text . strip ()}
-128
-129 return qtopics
-130
-131
-132 @dataclasses . dataclass
-133 class CSVParser ( Parser ):
-134 """
-135 Loads topics from a CSV file
-136 """
-137 id_field = "id"
-138 parse_fields = [ "text" ]
-139
-140 def __init__ ( self , parsed_fields = None ):
-141 if parsed_fields is None :
-142 self . parsed_fields = [ "id" , "text" ]
-143
-144 @classmethod
-145 def _get_topics ( cls , csvfile , dialect = "excel" ,
-146 id_field : str = None ,
-147 parse_fields : List [ str ] = None ,
-148 * args , ** kwargs ) -> Dict [ int , Dict [ str , str ]]:
-149 topics = {}
-150
-151 if isinstance ( csvfile , str ):
-152 csvfile = open ( csvfile , 'rt' )
-153
-154 if id_field is None :
-155 id_field = cls . id_field
-156
-157 if parse_fields is None :
-158 parse_fields = cls . parse_fields
-159
-160 reader = csv . DictReader ( csvfile , dialect = dialect )
-161 for row in reader :
-162 temp = {}
-163
-164 for field in parse_fields :
-165 temp [ field ] = row [ field ]
-166
-167 topics [ row [ id_field ]] = temp
-168
-169 return topics
-170
-171
-172 @dataclasses . dataclass ( init = True )
-173 class TSVParser ( CSVParser ):
-174
-175 @classmethod
-176 def _get_topics ( cls , tsvfile , * args , ** kwargs ) -> Dict [ int , Dict [ str , str ]]:
-177 return CSVParser . _get_topics ( tsvfile , * args , dialect = 'excel-tab' , ** kwargs )
-178
-179
-180 @dataclasses . dataclass ( init = True )
-181 class JsonLinesParser ( Parser ):
-182 """
-183 Loads topics from a jsonl file,
-184 a JSON per line
-185
-186 Provide parse_fields, id_field and whether to ignore full matches on json keys
-187 secondary_id appends to the primary id as jsonlines are flattened structure and may contain duplicate ids.
-188 """
-189 parse_fields : List [ str ]
-190 id_field : str
-191 ignore_full_match : bool = True
-192 secondary_id : str = None
-193
-194 @classmethod
-195 def _get_topics ( cls , jsonlfile , id_field , parse_fields ,
-196 ignore_full_match = True , secondary_id = None , * args , ** kwargs ) -> Dict [ str , Dict ]:
-197 with open ( jsonlfile , "r" ) as jsonl_f :
-198 topics = {}
-199
-200 for jsonl in jsonl_f :
-201 json_dict = json . loads ( jsonl )
-202 _id = json_dict . pop ( id_field )
-203
-204 if secondary_id :
-205 _id = str ( _id ) + "_" + str ( json_dict [ secondary_id ])
-206
-207 for key in list ( json_dict . keys ()):
-208 found = False
-209 for _key in parse_fields :
-210 if ignore_full_match :
-211 if key in _key or key == _key or _key in key :
-212 found = True
-213 else :
-214 if _key == key :
-215 found = True
-216 if not found :
-217 json_dict . pop ( key )
-218
-219 topics [ _id ] = json_dict
-220
-221 return topics
-
-
-
-
-
-
-
-
@dataclass(init=False)
-
-
class
-
Parser :
-
-
View Source
-
-
-
- 20 @dataclass ( init = False )
-21 class Parser :
-22 """
-23 Parser interface
-24 """
-25
-26 parse_fields : List [ str ]
-27
-28 @classmethod
-29 def normalize ( cls , input_dict ) -> Dict :
-30 """
-31 Flatten the dictionary, i.e. from Dict[int, Dict] -> Dict[str, str_or_int]
-32
-33 :param input_dict:
-34 :return:
-35 """
-36 return pd . io . json . json_normalize ( input_dict ,
-37 sep = "." ) . to_dict ( orient = 'records' )[ 0 ]
-38
-39 def get_topics ( self , path , * args , ** kwargs ):
-40 """
-41 Instance method for getting topics, forwards instance self parameters to the _get_topics class method.
-42 """
-43
-44 self_kwargs = vars ( self )
-45 kwargs . update ( self_kwargs )
-46
-47 return self . _get_topics ( path , * args , ** kwargs )
-48
-49 @classmethod
-50 @abc . abstractmethod
-51 def _get_topics ( cls , path , * args , ** kwargs ) -> Dict [ int , Dict [ str , str ]]:
-52 raise NotImplementedError
-
-
-
-
-
-
-
-
-
- Parser ()
-
-
-
-
-
-
-
-
-
-
-
-
@classmethod
-
-
def
-
normalize (cls , input_dict ) -> Dict :
-
-
View Source
-
-
-
-
28 @classmethod
-29 def normalize ( cls , input_dict ) -> Dict :
-30 """
-31 Flatten the dictionary, i.e. from Dict[int, Dict] -> Dict[str, str_or_int]
-32
-33 :param input_dict:
-34 :return:
-35 """
-36 return pd . io . json . json_normalize ( input_dict ,
-37 sep = "." ) . to_dict ( orient = 'records' )[ 0 ]
-
-
-
-
Flatten the dictionary, i.e. from Dict[int, Dict] -> Dict[str, str_or_int]
-
-
Parameters
-
-
-
-
Returns
-
-
-
-
-
-
-
-
- def
- get_topics (self , path , * args , ** kwargs ):
-
- View Source
-
-
-
-
39 def get_topics ( self , path , * args , ** kwargs ):
-40 """
-41 Instance method for getting topics, forwards instance self parameters to the _get_topics class method.
-42 """
-43
-44 self_kwargs = vars ( self )
-45 kwargs . update ( self_kwargs )
-46
-47 return self . _get_topics ( path , * args , ** kwargs )
-
-
-
-
Instance method for getting topics, forwards instance self parameters to the _get_topics class method.
-
-
-
-
-
-
-
-
-
@dataclasses.dataclass(init=True)
-
-
class
-
PickleParser (Parser ):
-
- View Source
-
-
-
- 55 @dataclasses . dataclass ( init = True )
-56 class PickleParser ( Parser ):
-57 """
-58 Load topics from a pickle file
-59 """
-60
-61 @classmethod
-62 def _get_topics ( cls , path , * args , ** kwargs ) -> Dict [ int , Dict [ str , str ]]:
-63 return dill . load ( path )
-
-
-
- Load topics from a pickle file
-
-
-
-
-
-
- PickleParser (parse_fields : List [ str ] )
-
-
-
-
-
-
-
-
-
-
Inherited Members
-
-
-
-
-
-
-
-
-
@dataclasses.dataclass(init=True)
-
-
class
-
XMLParser (Parser ):
-
- View Source
-
-
-
- 66 @dataclasses . dataclass ( init = True )
- 67 class XMLParser ( Parser ):
- 68 """
- 69 Load topics from an XML file
- 70 """
- 71 topic_field_name : str
- 72 id_field : str
- 73 parse_fields : List [ str ]
- 74
- 75 @classmethod
- 76 def _recurse_to_child_node ( cls , node : ET . Element , track : List ):
- 77 """
- 78 Helper method to get all children nodes for text extraction in an xml.
- 79
- 80 :param node: Current node
- 81 :param track: List to track nodes
- 82 :return:
- 83 """
- 84 if len ( node . getchildren ()) > 0 :
- 85 for child in node . getchildren ():
- 86 track . append ( cls . _recurse_to_child_node ( child , track ))
- 87
- 88 return node
- 89
- 90 @classmethod
- 91 def unwrap ( cls , doc_dict , key ):
- 92 """
- 93 Converts defaultdict to dict and list of size 1 to just the element
- 94
- 95 :param doc_dict:
- 96 :param key:
- 97 """
- 98 if isinstance ( doc_dict [ key ], defaultdict ):
- 99 doc_dict [ key ] = dict ( doc_dict [ key ])
-100
-101 for e_key in doc_dict [ key ]:
-102 cls . unwrap ( doc_dict [ key ], e_key )
-103
-104 if isinstance ( doc_dict [ key ], list ):
-105 if len ( doc_dict [ key ]) == 1 :
-106 doc_dict [ key ] = doc_dict [ key ][ 0 ]
-107
-108 def _get_topics ( self , path , * args , ** kwargs ) -> Dict [ int , Dict [ str , str ]]:
-109 all_topics = ET . parse ( path ) . getroot ()
-110 qtopics = {}
-111
-112 for topic in all_topics . findall ( self . topic_field_name ):
-113 _id = topic . attrib [ self . id_field ]
-114 if _id . isnumeric ():
-115 _id = int ( _id )
-116
-117 if self . parse_fields :
-118 temp = {}
-119 for field in self . parse_fields :
-120 try :
-121 temp [ field ] = topic . find ( field ) . text . strip ()
-122 except :
-123 continue
-124
-125 qtopics [ _id ] = temp
-126 else :
-127 # The topic contains the text
-128 qtopics [ _id ] = { "query" : topic . text . strip ()}
-129
-130 return qtopics
-
-
-
- Load topics from an XML file
-
-
-
-
-
-
- XMLParser (parse_fields : List [ str ] , topic_field_name : str , id_field : str )
-
-
-
-
-
-
-
-
-
-
-
-
@classmethod
-
-
def
-
unwrap (cls , doc_dict , key ):
-
-
View Source
-
-
-
-
90 @classmethod
- 91 def unwrap ( cls , doc_dict , key ):
- 92 """
- 93 Converts defaultdict to dict and list of size 1 to just the element
- 94
- 95 :param doc_dict:
- 96 :param key:
- 97 """
- 98 if isinstance ( doc_dict [ key ], defaultdict ):
- 99 doc_dict [ key ] = dict ( doc_dict [ key ])
-100
-101 for e_key in doc_dict [ key ]:
-102 cls . unwrap ( doc_dict [ key ], e_key )
-103
-104 if isinstance ( doc_dict [ key ], list ):
-105 if len ( doc_dict [ key ]) == 1 :
-106 doc_dict [ key ] = doc_dict [ key ][ 0 ]
-
-
-
-
Converts defaultdict to dict and list of size 1 to just the element
-
-
Parameters
-
-
-
-
-
-
-
-
Inherited Members
-
-
-
-
-
-
-
-
-
@dataclasses.dataclass
-
-
class
-
CSVParser (Parser ):
-
- View Source
-
-
-
- 133 @dataclasses . dataclass
-134 class CSVParser ( Parser ):
-135 """
-136 Loads topics from a CSV file
-137 """
-138 id_field = "id"
-139 parse_fields = [ "text" ]
-140
-141 def __init__ ( self , parsed_fields = None ):
-142 if parsed_fields is None :
-143 self . parsed_fields = [ "id" , "text" ]
-144
-145 @classmethod
-146 def _get_topics ( cls , csvfile , dialect = "excel" ,
-147 id_field : str = None ,
-148 parse_fields : List [ str ] = None ,
-149 * args , ** kwargs ) -> Dict [ int , Dict [ str , str ]]:
-150 topics = {}
-151
-152 if isinstance ( csvfile , str ):
-153 csvfile = open ( csvfile , 'rt' )
-154
-155 if id_field is None :
-156 id_field = cls . id_field
-157
-158 if parse_fields is None :
-159 parse_fields = cls . parse_fields
-160
-161 reader = csv . DictReader ( csvfile , dialect = dialect )
-162 for row in reader :
-163 temp = {}
-164
-165 for field in parse_fields :
-166 temp [ field ] = row [ field ]
-167
-168 topics [ row [ id_field ]] = temp
-169
-170 return topics
-
-
-
- Loads topics from a CSV file
-
-
-
-
-
-
-
- CSVParser (parsed_fields = None )
-
- View Source
-
-
-
-
141 def __init__ ( self , parsed_fields = None ):
-142 if parsed_fields is None :
-143 self . parsed_fields = [ "id" , "text" ]
-
-
-
-
-
-
-
-
Inherited Members
-
-
-
-
-
-
-
-
-
@dataclasses.dataclass(init=True)
-
-
class
-
TSVParser (CSVParser ):
-
- View Source
-
-
-
- 173 @dataclasses . dataclass ( init = True )
-174 class TSVParser ( CSVParser ):
-175
-176 @classmethod
-177 def _get_topics ( cls , tsvfile , * args , ** kwargs ) -> Dict [ int , Dict [ str , str ]]:
-178 return CSVParser . _get_topics ( tsvfile , * args , dialect = 'excel-tab' , ** kwargs )
-
-
-
-
-
-
-
-
- TSVParser (parse_fields : List [ str ] )
-
-
-
-
-
-
-
-
-
-
Inherited Members
-
-
-
-
-
-
-
-
-
@dataclasses.dataclass(init=True)
-
-
class
-
JsonLinesParser (Parser ):
-
- View Source
-
-
-
- 181 @dataclasses . dataclass ( init = True )
-182 class JsonLinesParser ( Parser ):
-183 """
-184 Loads topics from a jsonl file,
-185 a JSON per line
-186
-187 Provide parse_fields, id_field and whether to ignore full matches on json keys
-188 secondary_id appends to the primary id as jsonlines are flattened structure and may contain duplicate ids.
-189 """
-190 parse_fields : List [ str ]
-191 id_field : str
-192 ignore_full_match : bool = True
-193 secondary_id : str = None
-194
-195 @classmethod
-196 def _get_topics ( cls , jsonlfile , id_field , parse_fields ,
-197 ignore_full_match = True , secondary_id = None , * args , ** kwargs ) -> Dict [ str , Dict ]:
-198 with open ( jsonlfile , "r" ) as jsonl_f :
-199 topics = {}
-200
-201 for jsonl in jsonl_f :
-202 json_dict = json . loads ( jsonl )
-203 _id = json_dict . pop ( id_field )
-204
-205 if secondary_id :
-206 _id = str ( _id ) + "_" + str ( json_dict [ secondary_id ])
-207
-208 for key in list ( json_dict . keys ()):
-209 found = False
-210 for _key in parse_fields :
-211 if ignore_full_match :
-212 if key in _key or key == _key or _key in key :
-213 found = True
-214 else :
-215 if _key == key :
-216 found = True
-217 if not found :
-218 json_dict . pop ( key )
-219
-220 topics [ _id ] = json_dict
-221
-222 return topics
-
-
-
- Loads topics from a jsonl file,
-a JSON per line
-
-
Provide parse_fields, id_field and whether to ignore full matches on json keys
-secondary_id appends to the primary id as jsonlines are flattened structure and may contain duplicate ids.
-
-
-
-
-
-
- JsonLinesParser ( parse_fields : List [ str ] , id_field : str , ignore_full_match : bool = True , secondary_id : str = None )
-
-
-
-
-
-
-
-
-
-
Inherited Members
-
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/docs/debeir/interfaces/pipeline.html b/docs/debeir/interfaces/pipeline.html
deleted file mode 100644
index 0764e1b..0000000
--- a/docs/debeir/interfaces/pipeline.html
+++ /dev/null
@@ -1,798 +0,0 @@
-
-
-
-
-
-
-
debeir.interfaces.pipeline API documentation
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- View Source
-
- 1 import abc
- 2
- 3 import debeir
- 4 from loguru import logger
- 5 from typing import List
- 6
- 7 from debeir.engines.client import Client
- 8 from debeir.data_sets.factory import factory_fn , get_nir_config
- 9 from debeir.interfaces.executor import GenericElasticsearchExecutor
- 10 from debeir.interfaces.config import Config , _NIRMasterConfig
- 11 from debeir.interfaces.config import GenericConfig
- 12
- 13
- 14 class Pipeline :
- 15 pipeline_structure = [ "parser" , "query" , "engine" , "evaluator" ]
- 16 cannot_disable = [ "parser" , "query" , "engine" ]
- 17 callbacks : List [ 'debeir.interfaces.callbacks.Callback' ]
- 18 output_file = None
- 19
- 20 def __init__ ( self , engine : GenericElasticsearchExecutor ,
- 21 metrics_config ,
- 22 engine_config ,
- 23 nir_config ,
- 24 run_config : Config ,
- 25 callbacks = None ):
- 26
- 27 self . engine = engine
- 28 self . run_config = run_config
- 29 self . metrics_config = metrics_config
- 30 self . engine_config = engine_config
- 31 self . nir_config = nir_config
- 32 self . output_file = None
- 33 self . disable = {}
- 34
- 35 if callbacks is None :
- 36 self . callbacks = []
- 37 else :
- 38 self . callbacks = callbacks
- 39
- 40 @classmethod
- 41 def build_from_config ( cls , nir_config_fp , engine , config_fp ) -> 'Pipeline' :
- 42 query_cls , config , parser , executor_cls = factory_fn ( config_fp )
- 43
- 44 nir_config , search_engine_config , metrics_config = get_nir_config ( nir_config_fp ,
- 45 engine = engine ,
- 46 ignore_errors = False )
- 47
- 48 client = Client . build_from_config ( engine , search_engine_config )
- 49 topics = parser . _get_topics ( config . topics_path )
- 50
- 51 query = query_cls ( topics = topics , query_type = config . query_type , config = config )
- 52
- 53 executor = executor_cls . build_from_config (
- 54 topics ,
- 55 query ,
- 56 client . get_client ( engine ),
- 57 config ,
- 58 nir_config
- 59 )
- 60
- 61 return cls (
- 62 executor ,
- 63 metrics_config ,
- 64 search_engine_config ,
- 65 nir_config ,
- 66 config
- 67 )
- 68
- 69 def disable ( self , parts : list ):
- 70 for part in parts :
- 71 if part in self . pipeline_structure and part not in self . cannot_disable :
- 72 self . disable [ part ] = True
- 73 else :
- 74 logger . warning ( f "Cannot disable { part } because it doesn't exist or is integral to the pipeline" )
- 75
- 76 @abc . abstractmethod
- 77 async def run_pipeline ( self , * args ,
- 78 ** kwargs ):
- 79 raise NotImplementedError ()
- 80
- 81
- 82 class NIRPipeline ( Pipeline ):
- 83 run_config : GenericConfig
- 84
- 85 def __init__ ( self , * args , ** kwargs ):
- 86 super () . __init__ ( * args , ** kwargs )
- 87
- 88 async def prehook ( self ):
- 89 if self . run_config . automatic or self . run_config . norm_weight == "automatic" :
- 90 logger . info ( f "Running initial BM25 for query adjustment" )
- 91 await self . engine . run_automatic_adjustment ()
- 92
- 93 async def run_engine ( self , * args , ** kwargs ):
- 94 # Run bm25 nir adjustment
- 95 logger . info ( f "Running { self . run_config . query_type } queries" )
- 96
- 97 return await self . engine . run_all_queries ( * args , return_results = True , ** kwargs )
- 98
- 99 async def posthook ( self , * args , ** kwargs ):
-100 pass
-101
-102 async def run_pipeline ( self , * args , return_results = False , ** kwargs ):
-103 for cb in self . callbacks :
-104 cb . before ( self )
-105
-106 await self . prehook ()
-107 results = await self . run_engine ( * args , ** kwargs )
-108
-109 for cb in self . callbacks :
-110 cb . after ( results )
-111
-112 if return_results :
-113 return results
-114
-115 def register_callback ( self , cb ):
-116 self . callbacks . append ( cb )
-
-
-
-
-
-
-
-
- class
- Pipeline :
-
- View Source
-
-
-
- 15 class Pipeline :
-16 pipeline_structure = [ "parser" , "query" , "engine" , "evaluator" ]
-17 cannot_disable = [ "parser" , "query" , "engine" ]
-18 callbacks : List [ 'debeir.interfaces.callbacks.Callback' ]
-19 output_file = None
-20
-21 def __init__ ( self , engine : GenericElasticsearchExecutor ,
-22 metrics_config ,
-23 engine_config ,
-24 nir_config ,
-25 run_config : Config ,
-26 callbacks = None ):
-27
-28 self . engine = engine
-29 self . run_config = run_config
-30 self . metrics_config = metrics_config
-31 self . engine_config = engine_config
-32 self . nir_config = nir_config
-33 self . output_file = None
-34 self . disable = {}
-35
-36 if callbacks is None :
-37 self . callbacks = []
-38 else :
-39 self . callbacks = callbacks
-40
-41 @classmethod
-42 def build_from_config ( cls , nir_config_fp , engine , config_fp ) -> 'Pipeline' :
-43 query_cls , config , parser , executor_cls = factory_fn ( config_fp )
-44
-45 nir_config , search_engine_config , metrics_config = get_nir_config ( nir_config_fp ,
-46 engine = engine ,
-47 ignore_errors = False )
-48
-49 client = Client . build_from_config ( engine , search_engine_config )
-50 topics = parser . _get_topics ( config . topics_path )
-51
-52 query = query_cls ( topics = topics , query_type = config . query_type , config = config )
-53
-54 executor = executor_cls . build_from_config (
-55 topics ,
-56 query ,
-57 client . get_client ( engine ),
-58 config ,
-59 nir_config
-60 )
-61
-62 return cls (
-63 executor ,
-64 metrics_config ,
-65 search_engine_config ,
-66 nir_config ,
-67 config
-68 )
-69
-70 def disable ( self , parts : list ):
-71 for part in parts :
-72 if part in self . pipeline_structure and part not in self . cannot_disable :
-73 self . disable [ part ] = True
-74 else :
-75 logger . warning ( f "Cannot disable { part } because it doesn't exist or is integral to the pipeline" )
-76
-77 @abc . abstractmethod
-78 async def run_pipeline ( self , * args ,
-79 ** kwargs ):
-80 raise NotImplementedError ()
-
-
-
-
-
-
-
-
-
-
21 def __init__ ( self , engine : GenericElasticsearchExecutor ,
-22 metrics_config ,
-23 engine_config ,
-24 nir_config ,
-25 run_config : Config ,
-26 callbacks = None ):
-27
-28 self . engine = engine
-29 self . run_config = run_config
-30 self . metrics_config = metrics_config
-31 self . engine_config = engine_config
-32 self . nir_config = nir_config
-33 self . output_file = None
-34 self . disable = {}
-35
-36 if callbacks is None :
-37 self . callbacks = []
-38 else :
-39 self . callbacks = callbacks
-
-
-
-
-
-
-
-
-
-
- def
- disable (self , parts : list ):
-
- View Source
-
-
-
-
70 def disable ( self , parts : list ):
-71 for part in parts :
-72 if part in self . pipeline_structure and part not in self . cannot_disable :
-73 self . disable [ part ] = True
-74 else :
-75 logger . warning ( f "Cannot disable { part } because it doesn't exist or is integral to the pipeline" )
-
-
-
-
-
-
-
-
-
-
-
41 @classmethod
-42 def build_from_config ( cls , nir_config_fp , engine , config_fp ) -> 'Pipeline' :
-43 query_cls , config , parser , executor_cls = factory_fn ( config_fp )
-44
-45 nir_config , search_engine_config , metrics_config = get_nir_config ( nir_config_fp ,
-46 engine = engine ,
-47 ignore_errors = False )
-48
-49 client = Client . build_from_config ( engine , search_engine_config )
-50 topics = parser . _get_topics ( config . topics_path )
-51
-52 query = query_cls ( topics = topics , query_type = config . query_type , config = config )
-53
-54 executor = executor_cls . build_from_config (
-55 topics ,
-56 query ,
-57 client . get_client ( engine ),
-58 config ,
-59 nir_config
-60 )
-61
-62 return cls (
-63 executor ,
-64 metrics_config ,
-65 search_engine_config ,
-66 nir_config ,
-67 config
-68 )
-
-
-
-
-
-
-
-
-
-
@abc.abstractmethod
-
-
async def
-
run_pipeline (self , * args , ** kwargs ):
-
-
View Source
-
-
-
-
77 @abc . abstractmethod
-78 async def run_pipeline ( self , * args ,
-79 ** kwargs ):
-80 raise NotImplementedError ()
-
-
-
-
-
-
-
-
-
-
-
-
class
-
NIRPipeline (Pipeline ):
-
- View Source
-
-
-
- 83 class NIRPipeline ( Pipeline ):
- 84 run_config : GenericConfig
- 85
- 86 def __init__ ( self , * args , ** kwargs ):
- 87 super () . __init__ ( * args , ** kwargs )
- 88
- 89 async def prehook ( self ):
- 90 if self . run_config . automatic or self . run_config . norm_weight == "automatic" :
- 91 logger . info ( f "Running initial BM25 for query adjustment" )
- 92 await self . engine . run_automatic_adjustment ()
- 93
- 94 async def run_engine ( self , * args , ** kwargs ):
- 95 # Run bm25 nir adjustment
- 96 logger . info ( f "Running { self . run_config . query_type } queries" )
- 97
- 98 return await self . engine . run_all_queries ( * args , return_results = True , ** kwargs )
- 99
-100 async def posthook ( self , * args , ** kwargs ):
-101 pass
-102
-103 async def run_pipeline ( self , * args , return_results = False , ** kwargs ):
-104 for cb in self . callbacks :
-105 cb . before ( self )
-106
-107 await self . prehook ()
-108 results = await self . run_engine ( * args , ** kwargs )
-109
-110 for cb in self . callbacks :
-111 cb . after ( results )
-112
-113 if return_results :
-114 return results
-115
-116 def register_callback ( self , cb ):
-117 self . callbacks . append ( cb )
-
-
-
-
-
-
-
-
-
- NIRPipeline (* args , ** kwargs )
-
- View Source
-
-
-
-
86 def __init__ ( self , * args , ** kwargs ):
-87 super () . __init__ ( * args , ** kwargs )
-
-
-
-
-
-
-
-
-
-
- async def
- prehook (self ):
-
- View Source
-
-
-
-
89 async def prehook ( self ):
-90 if self . run_config . automatic or self . run_config . norm_weight == "automatic" :
-91 logger . info ( f "Running initial BM25 for query adjustment" )
-92 await self . engine . run_automatic_adjustment ()
-
-
-
-
-
-
-
-
-
-
- async def
- run_engine (self , * args , ** kwargs ):
-
- View Source
-
-
-
-
94 async def run_engine ( self , * args , ** kwargs ):
-95 # Run bm25 nir adjustment
-96 logger . info ( f "Running { self . run_config . query_type } queries" )
-97
-98 return await self . engine . run_all_queries ( * args , return_results = True , ** kwargs )
-
-
-
-
-
-
-
-
-
-
- async def
- posthook (self , * args , ** kwargs ):
-
- View Source
-
-
-
-
100 async def posthook ( self , * args , ** kwargs ):
-101 pass
-
-
-
-
-
-
-
-
-
-
- async def
- run_pipeline (self , * args , return_results = False , ** kwargs ):
-
- View Source
-
-
-
-
103 async def run_pipeline ( self , * args , return_results = False , ** kwargs ):
-104 for cb in self . callbacks :
-105 cb . before ( self )
-106
-107 await self . prehook ()
-108 results = await self . run_engine ( * args , ** kwargs )
-109
-110 for cb in self . callbacks :
-111 cb . after ( results )
-112
-113 if return_results :
-114 return results
-
-
-
-
-
-
-
-
-
-
- def
- register_callback (self , cb ):
-
- View Source
-
-
-
-
116 def register_callback ( self , cb ):
-117 self . callbacks . append ( cb )
-
-
-
-
-
-
-
-
Inherited Members
-
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/docs/debeir/interfaces/query.html b/docs/debeir/interfaces/query.html
deleted file mode 100644
index 2cf4edf..0000000
--- a/docs/debeir/interfaces/query.html
+++ /dev/null
@@ -1,940 +0,0 @@
-
-
-
-
-
-
-
debeir.interfaces.query API documentation
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- View Source
-
- 1 import dataclasses
- 2
- 3 import loguru
- 4
- 5 from typing import Dict , Union , Optional
- 6
- 7 from debeir.interfaces.config import apply_config , GenericConfig
- 8 from debeir.engines.elasticsearch.generate_script_score import generate_script
- 9 from debeir.utils.scaler import get_z_value
- 10
- 11
- 12 @dataclasses . dataclass ( init = True )
- 13 class Query :
- 14 """
- 15 A query interface class
- 16 :param topics: Topics that the query will be composed of
- 17 :param config: Config object that contains the settings for querying
- 18 """
- 19 topics : Dict [ int , Dict [ str , str ]]
- 20 config : GenericConfig
- 21
- 22
- 23 class GenericElasticsearchQuery ( Query ):
- 24 """
- 25 A generic elasticsearch query. Contains methods for NIR-style (embedding) queries and normal BM25 queries.
- 26 Requires topics, configs to be included
- 27 """
- 28 id_mapping : str = "Id"
- 29
- 30 def __init__ ( self , topics , config , top_bm25_scores = None , mappings = None , id_mapping = None , * args , ** kwargs ):
- 31 super () . __init__ ( topics , config )
- 32
- 33 if id_mapping is None :
- 34 self . id_mapping = "id"
- 35
- 36 if mappings is None :
- 37 self . mappings = [ "Text" ]
- 38 else :
- 39 self . mappings = mappings
- 40
- 41 self . topics = topics
- 42 self . config = config
- 43 self . query_type = self . config . query_type
- 44
- 45 self . embed_mappings = [ "Text_Embedding" ]
- 46
- 47 self . query_funcs = {
- 48 "query" : self . generate_query ,
- 49 "embedding" : self . generate_query_embedding ,
- 50 }
- 51
- 52 self . top_bm25_scores = top_bm25_scores
- 53
- 54 def _generate_base_query ( self , topic_num ):
- 55 qfield = list ( self . topics [ topic_num ] . keys ())[ 0 ]
- 56 query = self . topics [ topic_num ][ qfield ]
- 57 should = { "should" : []}
- 58
- 59 for i , field in enumerate ( self . mappings ):
- 60 should [ "should" ] . append (
- 61 {
- 62 "match" : {
- 63 f " { field } " : {
- 64 "query" : query ,
- 65 }
- 66 }
- 67 }
- 68 )
- 69
- 70 return qfield , query , should
- 71
- 72 def generate_query ( self , topic_num , * args , ** kwargs ):
- 73 """
- 74 Generates a simple BM25 query based off the query facets. Searches over all the document facets.
- 75 :param topic_num:
- 76 :param args:
- 77 :param kwargs:
- 78 :return:
- 79 """
- 80 _ , _ , should = self . _generate_base_query ( topic_num )
- 81
- 82 query = {
- 83 "query" : {
- 84 "bool" : should ,
- 85 }
- 86 }
- 87
- 88 return query
- 89
- 90 def set_bm25_scores ( self , scores : Dict [ Union [ str , int ], Union [ int , float ]]):
- 91 """
- 92 Sets BM25 scores that are used for NIR-style scoring. The top BM25 score for each topic is used
- 93 for log normalization.
- 94
- 95 Score = log(bm25)/log(z) + embed_score
- 96 :param scores: Top BM25 Scores of the form {topic_num: top_bm25_score}
- 97 """
- 98 self . top_bm25_scores = scores
- 99
-100 def has_bm25_scores ( self ):
-101 """
-102 Checks if BM25 scores have been set
-103 :return:
-104 """
-105 return self . top_bm25_scores is not None
-106
-107 @apply_config
-108 def generate_query_embedding (
-109 self , topic_num , encoder , * args , norm_weight = 2.15 , ablations = False , cosine_ceiling = Optional [ float ],
-110 cosine_offset : float = 1.0 , ** kwargs ):
-111 """
-112 Generates an embedding script score query for Elasticsearch as part of the NIR scoring function.
-113
-114 :param topic_num: The topic number to search for
-115 :param encoder: The encoder that will be used for encoding the topics
-116 :param norm_weight: The BM25 log normalization constant
-117 :param ablations: Whether to execute ablation style queries (i.e. one query facet
-118 or one document facet at a time)
-119 :param cosine_ceiling: Cosine ceiling used for automatic z-log normalization parameter calculation
-120 :param args:
-121 :param kwargs: Pass disable_cache to disable encoder caching
-122 :return:
-123 An elasticsearch script_score query
-124 """
-125
-126 qfields = list ( self . topics [ topic_num ] . keys ())
-127 should = { "should" : []}
-128
-129 if self . has_bm25_scores ():
-130 cosine_ceiling = len ( self . embed_mappings ) * len ( qfields ) if cosine_ceiling is None else cosine_ceiling
-131 norm_weight = get_z_value (
-132 cosine_ceiling = cosine_ceiling ,
-133 bm25_ceiling = self . top_bm25_scores [ topic_num ],
-134 )
-135 loguru . logger . debug ( f "Automatic norm_weight: { norm_weight } " )
-136
-137 params = {
-138 "weights" : [ 1 ] * ( len ( self . embed_mappings ) * len ( self . mappings )),
-139 "offset" : cosine_offset ,
-140 "norm_weight" : norm_weight ,
-141 "disable_bm25" : ablations ,
-142 }
-143
-144 embed_fields = []
-145
-146 for qfield in qfields :
-147 for field in self . mappings :
-148 should [ "should" ] . append (
-149 {
-150 "match" : {
-151 f " { field } " : {
-152 "query" : self . topics [ topic_num ][ qfield ],
-153 }
-154 }
-155 }
-156 )
-157
-158 params [ f " { qfield } _eb" ] = encoder . encode (
-159 encoder , topic = self . topics [ topic_num ][ qfield ]
-160 )
-161 embed_fields . append ( f " { qfield } _eb" )
-162
-163 query = {
-164 "query" : {
-165 "script_score" : {
-166 "query" : {
-167 "bool" : should ,
-168 },
-169 "script" : generate_script (
-170 self . embed_mappings , params , qfields = embed_fields
-171 ),
-172 }
-173 }
-174 }
-175
-176 loguru . logger . debug ( query )
-177 return query
-178
-179 @classmethod
-180 def get_id_mapping ( cls , hit ):
-181 """
-182 Get the document ID
-183
-184 :param hit: The raw document result
-185 :return:
-186 The document's ID
-187 """
-188 return hit [ cls . id_mapping ]
-
-
-
-
-
-
-
-
@dataclasses.dataclass(init=True)
-
-
class
-
Query :
-
-
View Source
-
-
-
- 13 @dataclasses . dataclass ( init = True )
-14 class Query :
-15 """
-16 A query interface class
-17 :param topics: Topics that the query will be composed of
-18 :param config: Config object that contains the settings for querying
-19 """
-20 topics : Dict [ int , Dict [ str , str ]]
-21 config : GenericConfig
-
-
-
- A query interface class
-
-
Parameters
-
-
-topics : Topics that the query will be composed of
-config : Config object that contains the settings for querying
-
-
-
-
-
-
-
-
-
-
-
class
-
GenericElasticsearchQuery (Query ):
-
- View Source
-
-
-
- 24 class GenericElasticsearchQuery ( Query ):
- 25 """
- 26 A generic elasticsearch query. Contains methods for NIR-style (embedding) queries and normal BM25 queries.
- 27 Requires topics, configs to be included
- 28 """
- 29 id_mapping : str = "Id"
- 30
- 31 def __init__ ( self , topics , config , top_bm25_scores = None , mappings = None , id_mapping = None , * args , ** kwargs ):
- 32 super () . __init__ ( topics , config )
- 33
- 34 if id_mapping is None :
- 35 self . id_mapping = "id"
- 36
- 37 if mappings is None :
- 38 self . mappings = [ "Text" ]
- 39 else :
- 40 self . mappings = mappings
- 41
- 42 self . topics = topics
- 43 self . config = config
- 44 self . query_type = self . config . query_type
- 45
- 46 self . embed_mappings = [ "Text_Embedding" ]
- 47
- 48 self . query_funcs = {
- 49 "query" : self . generate_query ,
- 50 "embedding" : self . generate_query_embedding ,
- 51 }
- 52
- 53 self . top_bm25_scores = top_bm25_scores
- 54
- 55 def _generate_base_query ( self , topic_num ):
- 56 qfield = list ( self . topics [ topic_num ] . keys ())[ 0 ]
- 57 query = self . topics [ topic_num ][ qfield ]
- 58 should = { "should" : []}
- 59
- 60 for i , field in enumerate ( self . mappings ):
- 61 should [ "should" ] . append (
- 62 {
- 63 "match" : {
- 64 f " { field } " : {
- 65 "query" : query ,
- 66 }
- 67 }
- 68 }
- 69 )
- 70
- 71 return qfield , query , should
- 72
- 73 def generate_query ( self , topic_num , * args , ** kwargs ):
- 74 """
- 75 Generates a simple BM25 query based off the query facets. Searches over all the document facets.
- 76 :param topic_num:
- 77 :param args:
- 78 :param kwargs:
- 79 :return:
- 80 """
- 81 _ , _ , should = self . _generate_base_query ( topic_num )
- 82
- 83 query = {
- 84 "query" : {
- 85 "bool" : should ,
- 86 }
- 87 }
- 88
- 89 return query
- 90
- 91 def set_bm25_scores ( self , scores : Dict [ Union [ str , int ], Union [ int , float ]]):
- 92 """
- 93 Sets BM25 scores that are used for NIR-style scoring. The top BM25 score for each topic is used
- 94 for log normalization.
- 95
- 96 Score = log(bm25)/log(z) + embed_score
- 97 :param scores: Top BM25 Scores of the form {topic_num: top_bm25_score}
- 98 """
- 99 self . top_bm25_scores = scores
-100
-101 def has_bm25_scores ( self ):
-102 """
-103 Checks if BM25 scores have been set
-104 :return:
-105 """
-106 return self . top_bm25_scores is not None
-107
-108 @apply_config
-109 def generate_query_embedding (
-110 self , topic_num , encoder , * args , norm_weight = 2.15 , ablations = False , cosine_ceiling = Optional [ float ],
-111 cosine_offset : float = 1.0 , ** kwargs ):
-112 """
-113 Generates an embedding script score query for Elasticsearch as part of the NIR scoring function.
-114
-115 :param topic_num: The topic number to search for
-116 :param encoder: The encoder that will be used for encoding the topics
-117 :param norm_weight: The BM25 log normalization constant
-118 :param ablations: Whether to execute ablation style queries (i.e. one query facet
-119 or one document facet at a time)
-120 :param cosine_ceiling: Cosine ceiling used for automatic z-log normalization parameter calculation
-121 :param args:
-122 :param kwargs: Pass disable_cache to disable encoder caching
-123 :return:
-124 An elasticsearch script_score query
-125 """
-126
-127 qfields = list ( self . topics [ topic_num ] . keys ())
-128 should = { "should" : []}
-129
-130 if self . has_bm25_scores ():
-131 cosine_ceiling = len ( self . embed_mappings ) * len ( qfields ) if cosine_ceiling is None else cosine_ceiling
-132 norm_weight = get_z_value (
-133 cosine_ceiling = cosine_ceiling ,
-134 bm25_ceiling = self . top_bm25_scores [ topic_num ],
-135 )
-136 loguru . logger . debug ( f "Automatic norm_weight: { norm_weight } " )
-137
-138 params = {
-139 "weights" : [ 1 ] * ( len ( self . embed_mappings ) * len ( self . mappings )),
-140 "offset" : cosine_offset ,
-141 "norm_weight" : norm_weight ,
-142 "disable_bm25" : ablations ,
-143 }
-144
-145 embed_fields = []
-146
-147 for qfield in qfields :
-148 for field in self . mappings :
-149 should [ "should" ] . append (
-150 {
-151 "match" : {
-152 f " { field } " : {
-153 "query" : self . topics [ topic_num ][ qfield ],
-154 }
-155 }
-156 }
-157 )
-158
-159 params [ f " { qfield } _eb" ] = encoder . encode (
-160 encoder , topic = self . topics [ topic_num ][ qfield ]
-161 )
-162 embed_fields . append ( f " { qfield } _eb" )
-163
-164 query = {
-165 "query" : {
-166 "script_score" : {
-167 "query" : {
-168 "bool" : should ,
-169 },
-170 "script" : generate_script (
-171 self . embed_mappings , params , qfields = embed_fields
-172 ),
-173 }
-174 }
-175 }
-176
-177 loguru . logger . debug ( query )
-178 return query
-179
-180 @classmethod
-181 def get_id_mapping ( cls , hit ):
-182 """
-183 Get the document ID
-184
-185 :param hit: The raw document result
-186 :return:
-187 The document's ID
-188 """
-189 return hit [ cls . id_mapping ]
-
-
-
- A generic elasticsearch query. Contains methods for NIR-style (embedding) queries and normal BM25 queries.
-Requires topics, configs to be included
-
-
-
-
-
-
-
- GenericElasticsearchQuery ( topics , config , top_bm25_scores = None , mappings = None , id_mapping = None , * args , ** kwargs )
-
- View Source
-
-
-
-
31 def __init__ ( self , topics , config , top_bm25_scores = None , mappings = None , id_mapping = None , * args , ** kwargs ):
-32 super () . __init__ ( topics , config )
-33
-34 if id_mapping is None :
-35 self . id_mapping = "id"
-36
-37 if mappings is None :
-38 self . mappings = [ "Text" ]
-39 else :
-40 self . mappings = mappings
-41
-42 self . topics = topics
-43 self . config = config
-44 self . query_type = self . config . query_type
-45
-46 self . embed_mappings = [ "Text_Embedding" ]
-47
-48 self . query_funcs = {
-49 "query" : self . generate_query ,
-50 "embedding" : self . generate_query_embedding ,
-51 }
-52
-53 self . top_bm25_scores = top_bm25_scores
-
-
-
-
-
-
-
-
-
-
- def
- generate_query (self , topic_num , * args , ** kwargs ):
-
- View Source
-
-
-
-
73 def generate_query ( self , topic_num , * args , ** kwargs ):
-74 """
-75 Generates a simple BM25 query based off the query facets. Searches over all the document facets.
-76 :param topic_num:
-77 :param args:
-78 :param kwargs:
-79 :return:
-80 """
-81 _ , _ , should = self . _generate_base_query ( topic_num )
-82
-83 query = {
-84 "query" : {
-85 "bool" : should ,
-86 }
-87 }
-88
-89 return query
-
-
-
-
Generates a simple BM25 query based off the query facets. Searches over all the document facets.
-
-
Parameters
-
-
-topic_num :
-args :
-kwargs :
-
-
-
Returns
-
-
-
-
-
-
-
-
- def
- set_bm25_scores (self , scores : Dict [ Union [ str , int ], Union [ int , float ]] ):
-
- View Source
-
-
-
-
91 def set_bm25_scores ( self , scores : Dict [ Union [ str , int ], Union [ int , float ]]):
-92 """
-93 Sets BM25 scores that are used for NIR-style scoring. The top BM25 score for each topic is used
-94 for log normalization.
-95
-96 Score = log(bm25)/log(z) + embed_score
-97 :param scores: Top BM25 Scores of the form {topic_num: top_bm25_score}
-98 """
-99 self . top_bm25_scores = scores
-
-
-
-
Sets BM25 scores that are used for NIR-style scoring. The top BM25 score for each topic is used
-for log normalization.
-
-
Score = log(bm25)/log(z) + embed_score
-
-
Parameters
-
-
-scores: Top BM25 Scores of the form {topic_num : top_bm25_score}
-
-
-
-
-
-
-
-
-
- def
- has_bm25_scores (self ):
-
- View Source
-
-
-
-
101 def has_bm25_scores ( self ):
-102 """
-103 Checks if BM25 scores have been set
-104 :return:
-105 """
-106 return self . top_bm25_scores is not None
-
-
-
-
Checks if BM25 scores have been set
-
-
Returns
-
-
-
-
-
-
-
-
- def
- generate_query_embedding (self , * args , ** kwargs ):
-
- View Source
-
-
-
-
231 def use_config ( self , * args , ** kwargs ):
-232 """
-233 Replaces keywords and args passed to the function with ones from self.config.
-234
-235 :param self:
-236 :param args: To be updated
-237 :param kwargs: To be updated
-238 :return:
-239 """
-240 if self . config is not None :
-241 kwargs = self . config . __update__ ( ** kwargs )
-242
-243 return func ( self , * args , ** kwargs )
-
-
-
-
Generates an embedding script score query for Elasticsearch as part of the NIR scoring function.
-
-
Parameters
-
-
-topic_num : The topic number to search for
-encoder : The encoder that will be used for encoding the topics
-norm_weight : The BM25 log normalization constant
-ablations : Whether to execute ablation style queries (i.e. one query facet
-or one document facet at a time)
-cosine_ceiling : Cosine ceiling used for automatic z-log normalization parameter calculation
-args :
-kwargs : Pass disable_cache to disable encoder caching
-
-
-
Returns
-
-
-An elasticsearch script_score query
-
-
-
-
-
-
-
-
-
-
@classmethod
-
-
def
-
get_id_mapping (cls , hit ):
-
-
View Source
-
-
-
-
180 @classmethod
-181 def get_id_mapping ( cls , hit ):
-182 """
-183 Get the document ID
-184
-185 :param hit: The raw document result
-186 :return:
-187 The document's ID
-188 """
-189 return hit [ cls . id_mapping ]
-
-
-
-
Get the document ID
-
-
Parameters
-
-
-hit : The raw document result
-
-
-
Returns
-
-
-The document's ID
-
-
-
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/docs/search.js b/docs/search.js
index 5d635ec..3526cc8 100644
--- a/docs/search.js
+++ b/docs/search.js
@@ -1,6 +1,6 @@
window.pdocSearch = (function(){
/** elasticlunr - http://weixsong.github.io * Copyright (C) 2017 Oliver Nightingale * Copyright (C) 2017 Wei Song * MIT Licensed */!function(){function e(e){if(null===e||"object"!=typeof e)return e;var t=e.constructor();for(var n in e)e.hasOwnProperty(n)&&(t[n]=e[n]);return t}var t=function(e){var n=new t.Index;return n.pipeline.add(t.trimmer,t.stopWordFilter,t.stemmer),e&&e.call(n,n),n};t.version="0.9.5",lunr=t,t.utils={},t.utils.warn=function(e){return function(t){e.console&&console.warn&&console.warn(t)}}(this),t.utils.toString=function(e){return void 0===e||null===e?"":e.toString()},t.EventEmitter=function(){this.events={}},t.EventEmitter.prototype.addListener=function(){var e=Array.prototype.slice.call(arguments),t=e.pop(),n=e;if("function"!=typeof t)throw new TypeError("last argument must be a function");n.forEach(function(e){this.hasHandler(e)||(this.events[e]=[]),this.events[e].push(t)},this)},t.EventEmitter.prototype.removeListener=function(e,t){if(this.hasHandler(e)){var n=this.events[e].indexOf(t);-1!==n&&(this.events[e].splice(n,1),0==this.events[e].length&&delete this.events[e])}},t.EventEmitter.prototype.emit=function(e){if(this.hasHandler(e)){var t=Array.prototype.slice.call(arguments,1);this.events[e].forEach(function(e){e.apply(void 0,t)},this)}},t.EventEmitter.prototype.hasHandler=function(e){return e in this.events},t.tokenizer=function(e){if(!arguments.length||null===e||void 0===e)return[];if(Array.isArray(e)){var n=e.filter(function(e){return null===e||void 0===e?!1:!0});n=n.map(function(e){return t.utils.toString(e).toLowerCase()});var i=[];return n.forEach(function(e){var n=e.split(t.tokenizer.seperator);i=i.concat(n)},this),i}return e.toString().trim().toLowerCase().split(t.tokenizer.seperator)},t.tokenizer.defaultSeperator=/[\s\-]+/,t.tokenizer.seperator=t.tokenizer.defaultSeperator,t.tokenizer.setSeperator=function(e){null!==e&&void 0!==e&&"object"==typeof 
e&&(t.tokenizer.seperator=e)},t.tokenizer.resetSeperator=function(){t.tokenizer.seperator=t.tokenizer.defaultSeperator},t.tokenizer.getSeperator=function(){return t.tokenizer.seperator},t.Pipeline=function(){this._queue=[]},t.Pipeline.registeredFunctions={},t.Pipeline.registerFunction=function(e,n){n in t.Pipeline.registeredFunctions&&t.utils.warn("Overwriting existing registered function: "+n),e.label=n,t.Pipeline.registeredFunctions[n]=e},t.Pipeline.getRegisteredFunction=function(e){return e in t.Pipeline.registeredFunctions!=!0?null:t.Pipeline.registeredFunctions[e]},t.Pipeline.warnIfFunctionNotRegistered=function(e){var n=e.label&&e.label in this.registeredFunctions;n||t.utils.warn("Function is not registered with pipeline. This may cause problems when serialising the index.\n",e)},t.Pipeline.load=function(e){var n=new t.Pipeline;return e.forEach(function(e){var i=t.Pipeline.getRegisteredFunction(e);if(!i)throw new Error("Cannot load un-registered function: "+e);n.add(i)}),n},t.Pipeline.prototype.add=function(){var e=Array.prototype.slice.call(arguments);e.forEach(function(e){t.Pipeline.warnIfFunctionNotRegistered(e),this._queue.push(e)},this)},t.Pipeline.prototype.after=function(e,n){t.Pipeline.warnIfFunctionNotRegistered(n);var i=this._queue.indexOf(e);if(-1===i)throw new Error("Cannot find existingFn");this._queue.splice(i+1,0,n)},t.Pipeline.prototype.before=function(e,n){t.Pipeline.warnIfFunctionNotRegistered(n);var i=this._queue.indexOf(e);if(-1===i)throw new Error("Cannot find existingFn");this._queue.splice(i,0,n)},t.Pipeline.prototype.remove=function(e){var t=this._queue.indexOf(e);-1!==t&&this._queue.splice(t,1)},t.Pipeline.prototype.run=function(e){for(var t=[],n=e.length,i=this._queue.length,o=0;n>o;o++){for(var r=e[o],s=0;i>s&&(r=this._queue[s](r,o,e),void 0!==r&&null!==r);s++);void 0!==r&&null!==r&&t.push(r)}return t},t.Pipeline.prototype.reset=function(){this._queue=[]},t.Pipeline.prototype.get=function(){return 
this._queue},t.Pipeline.prototype.toJSON=function(){return this._queue.map(function(e){return t.Pipeline.warnIfFunctionNotRegistered(e),e.label})},t.Index=function(){this._fields=[],this._ref="id",this.pipeline=new t.Pipeline,this.documentStore=new t.DocumentStore,this.index={},this.eventEmitter=new t.EventEmitter,this._idfCache={},this.on("add","remove","update",function(){this._idfCache={}}.bind(this))},t.Index.prototype.on=function(){var e=Array.prototype.slice.call(arguments);return this.eventEmitter.addListener.apply(this.eventEmitter,e)},t.Index.prototype.off=function(e,t){return this.eventEmitter.removeListener(e,t)},t.Index.load=function(e){e.version!==t.version&&t.utils.warn("version mismatch: current "+t.version+" importing "+e.version);var n=new this;n._fields=e.fields,n._ref=e.ref,n.documentStore=t.DocumentStore.load(e.documentStore),n.pipeline=t.Pipeline.load(e.pipeline),n.index={};for(var i in e.index)n.index[i]=t.InvertedIndex.load(e.index[i]);return n},t.Index.prototype.addField=function(e){return this._fields.push(e),this.index[e]=new t.InvertedIndex,this},t.Index.prototype.setRef=function(e){return this._ref=e,this},t.Index.prototype.saveDocument=function(e){return this.documentStore=new t.DocumentStore(e),this},t.Index.prototype.addDoc=function(e,n){if(e){var n=void 0===n?!0:n,i=e[this._ref];this.documentStore.addDoc(i,e),this._fields.forEach(function(n){var o=this.pipeline.run(t.tokenizer(e[n]));this.documentStore.addFieldLength(i,n,o.length);var r={};o.forEach(function(e){e in r?r[e]+=1:r[e]=1},this);for(var s in r){var u=r[s];u=Math.sqrt(u),this.index[n].addToken(s,{ref:i,tf:u})}},this),n&&this.eventEmitter.emit("add",e,this)}},t.Index.prototype.removeDocByRef=function(e){if(e&&this.documentStore.isDocStored()!==!1&&this.documentStore.hasDoc(e)){var t=this.documentStore.getDoc(e);this.removeDoc(t,!1)}},t.Index.prototype.removeDoc=function(e,n){if(e){var n=void 
0===n?!0:n,i=e[this._ref];this.documentStore.hasDoc(i)&&(this.documentStore.removeDoc(i),this._fields.forEach(function(n){var o=this.pipeline.run(t.tokenizer(e[n]));o.forEach(function(e){this.index[n].removeToken(e,i)},this)},this),n&&this.eventEmitter.emit("remove",e,this))}},t.Index.prototype.updateDoc=function(e,t){var t=void 0===t?!0:t;this.removeDocByRef(e[this._ref],!1),this.addDoc(e,!1),t&&this.eventEmitter.emit("update",e,this)},t.Index.prototype.idf=function(e,t){var n="@"+t+"/"+e;if(Object.prototype.hasOwnProperty.call(this._idfCache,n))return this._idfCache[n];var i=this.index[t].getDocFreq(e),o=1+Math.log(this.documentStore.length/(i+1));return this._idfCache[n]=o,o},t.Index.prototype.getFields=function(){return this._fields.slice()},t.Index.prototype.search=function(e,n){if(!e)return[];e="string"==typeof e?{any:e}:JSON.parse(JSON.stringify(e));var i=null;null!=n&&(i=JSON.stringify(n));for(var o=new t.Configuration(i,this.getFields()).get(),r={},s=Object.keys(e),u=0;u
0&&t.push(e);for(var i in n)"docs"!==i&&"df"!==i&&this.expandToken(e+i,t,n[i]);return t},t.InvertedIndex.prototype.toJSON=function(){return{root:this.root}},t.Configuration=function(e,n){var e=e||"";if(void 0==n||null==n)throw new Error("fields should not be null");this.config={};var i;try{i=JSON.parse(e),this.buildUserConfig(i,n)}catch(o){t.utils.warn("user configuration parse failed, will use default configuration"),this.buildDefaultConfig(n)}},t.Configuration.prototype.buildDefaultConfig=function(e){this.reset(),e.forEach(function(e){this.config[e]={boost:1,bool:"OR",expand:!1}},this)},t.Configuration.prototype.buildUserConfig=function(e,n){var i="OR",o=!1;if(this.reset(),"bool"in e&&(i=e.bool||i),"expand"in e&&(o=e.expand||o),"fields"in e)for(var r in e.fields)if(n.indexOf(r)>-1){var s=e.fields[r],u=o;void 0!=s.expand&&(u=s.expand),this.config[r]={boost:s.boost||0===s.boost?s.boost:1,bool:s.bool||i,expand:u}}else t.utils.warn("field name in user configuration not found in index instance fields");else this.addAllFields2UserConfig(i,o,n)},t.Configuration.prototype.addAllFields2UserConfig=function(e,t,n){n.forEach(function(n){this.config[n]={boost:1,bool:e,expand:t}},this)},t.Configuration.prototype.get=function(){return this.config},t.Configuration.prototype.reset=function(){this.config={}},lunr.SortedSet=function(){this.length=0,this.elements=[]},lunr.SortedSet.load=function(e){var t=new this;return t.elements=e,t.length=e.length,t},lunr.SortedSet.prototype.add=function(){var e,t;for(e=0;e1;){if(r===e)return o;e>r&&(t=o),r>e&&(n=o),i=n-t,o=t+Math.floor(i/2),r=this.elements[o]}return r===e?o:-1},lunr.SortedSet.prototype.locationFor=function(e){for(var t=0,n=this.elements.length,i=n-t,o=t+Math.floor(i/2),r=this.elements[o];i>1;)e>r&&(t=o),r>e&&(n=o),i=n-t,o=t+Math.floor(i/2),r=this.elements[o];return r>e?o:e>r?o+1:void 0},lunr.SortedSet.prototype.intersect=function(e){for(var t=new 
lunr.SortedSet,n=0,i=0,o=this.length,r=e.length,s=this.elements,u=e.elements;;){if(n>o-1||i>r-1)break;s[n]!==u[i]?s[n]u[i]&&i++:(t.add(s[n]),n++,i++)}return t},lunr.SortedSet.prototype.clone=function(){var e=new lunr.SortedSet;return e.elements=this.toArray(),e.length=e.elements.length,e},lunr.SortedSet.prototype.union=function(e){var t,n,i;this.length>=e.length?(t=this,n=e):(t=e,n=this),i=t.clone();for(var o=0,r=n.toArray();oThe DeBEIR (Dense Bi-Encoder Information Retrieval) source code library.\n\nSee ./main.py in the parent directory for an out-of-the-box runnable code.
\n\nOtherwise, check out notebooks in the parent directory for training your own model amongst other things.
\n"}, {"fullname": "debeir.core", "modulename": "debeir.core", "kind": "module", "doc": "Core library interfaces that must be implemented for custom datasets
\n\nInterfaces to implement custom data_sets in nir.data_sets.
\n"}, {"fullname": "debeir.core.callbacks", "modulename": "debeir.core.callbacks", "kind": "module", "doc": "Callbacks for before after running.\nE.g. before is for setup\nafter is for evaluation/serialization etc
\n"}, {"fullname": "debeir.core.callbacks.Callback", "modulename": "debeir.core.callbacks", "qualname": "Callback", "kind": "class", "doc": "
\n"}, {"fullname": "debeir.core.callbacks.Callback.__init__", "modulename": "debeir.core.callbacks", "qualname": "Callback.__init__", "kind": "function", "doc": "
\n", "signature": "() "}, {"fullname": "debeir.core.callbacks.Callback.before", "modulename": "debeir.core.callbacks", "qualname": "Callback.before", "kind": "function", "doc": "
\n", "signature": "(self , pipeline : debeir . core . pipeline . Pipeline ): ", "funcdef": "def"}, {"fullname": "debeir.core.callbacks.Callback.after", "modulename": "debeir.core.callbacks", "qualname": "Callback.after", "kind": "function", "doc": "
\n", "signature": "(self , results : List ): ", "funcdef": "def"}, {"fullname": "debeir.core.callbacks.SerializationCallback", "modulename": "debeir.core.callbacks", "qualname": "SerializationCallback", "kind": "class", "doc": "
\n", "bases": "Callback"}, {"fullname": "debeir.core.callbacks.SerializationCallback.__init__", "modulename": "debeir.core.callbacks", "qualname": "SerializationCallback.__init__", "kind": "function", "doc": "
\n", "signature": "(\tconfig : debeir . core . config . GenericConfig , \tnir_config : debeir . core . config . NIRConfig ) "}, {"fullname": "debeir.core.callbacks.SerializationCallback.before", "modulename": "debeir.core.callbacks", "qualname": "SerializationCallback.before", "kind": "function", "doc": "Check if output file exists
\n\nReturns \n\n\nOutput file path\n
\n \n", "signature": "(self , pipeline : debeir . core . pipeline . Pipeline ): ", "funcdef": "def"}, {"fullname": "debeir.core.callbacks.SerializationCallback.after", "modulename": "debeir.core.callbacks", "qualname": "SerializationCallback.after", "kind": "function", "doc": "Serialize results to self.output_file in a TREC-style format
\n\nParameters \n\n\ntopic_num : Topic number to serialize \nres : Raw elasticsearch result \nrun_name: The run name for TREC-style runs (default : NO_RUN_NAME) \n \n", "signature": "(self , results : List ): ", "funcdef": "def"}, {"fullname": "debeir.core.callbacks.EvaluationCallback", "modulename": "debeir.core.callbacks", "qualname": "EvaluationCallback", "kind": "class", "doc": "
\n", "bases": "Callback"}, {"fullname": "debeir.core.callbacks.EvaluationCallback.__init__", "modulename": "debeir.core.callbacks", "qualname": "EvaluationCallback.__init__", "kind": "function", "doc": "
\n", "signature": "(evaluator : debeir . evaluation . evaluator . Evaluator , config ) "}, {"fullname": "debeir.core.callbacks.EvaluationCallback.before", "modulename": "debeir.core.callbacks", "qualname": "EvaluationCallback.before", "kind": "function", "doc": "
\n", "signature": "(self , pipeline : debeir . core . pipeline . Pipeline ): ", "funcdef": "def"}, {"fullname": "debeir.core.callbacks.EvaluationCallback.after", "modulename": "debeir.core.callbacks", "qualname": "EvaluationCallback.after", "kind": "function", "doc": "
\n", "signature": "(self , results : List , id_field = 'id' ): ", "funcdef": "def"}, {"fullname": "debeir.core.config", "modulename": "debeir.core.config", "kind": "module", "doc": "
\n"}, {"fullname": "debeir.core.config.Config", "modulename": "debeir.core.config", "qualname": "Config", "kind": "class", "doc": "Config Interface with creation class methods
\n"}, {"fullname": "debeir.core.config.Config.__init__", "modulename": "debeir.core.config", "qualname": "Config.__init__", "kind": "function", "doc": "
\n", "signature": "() "}, {"fullname": "debeir.core.config.Config.from_toml", "modulename": "debeir.core.config", "qualname": "Config.from_toml", "kind": "function", "doc": "Instantiates a Config object from a toml file
\n\nParameters \n\n\nfp : File path of the Config TOML file \nfield_class : Class of the Config object to be instantiated \nargs : Arguments to be passed to Config \nkwargs : Keyword arguments to be passed \n \n\nReturns \n\n\nA instantiated and validated Config object.\n
\n \n", "signature": "(\tcls , \tfp : Union [ str , pathlib . Path ] , \tfield_class , \t* args , \t** kwargs ) -> debeir . core . config . Config : ", "funcdef": "def"}, {"fullname": "debeir.core.config.Config.from_args", "modulename": "debeir.core.config", "qualname": "Config.from_args", "kind": "function", "doc": "Instantiates a Config object from arguments
\n\nParameters \n\n\nargs_dict : \nfield_class : \nargs : \nkwargs : \n \n\nReturns \n", "signature": "(cls , args_dict : MutableMapping , field_class , * args , ** kwargs ): ", "funcdef": "def"}, {"fullname": "debeir.core.config.Config.from_dict", "modulename": "debeir.core.config", "qualname": "Config.from_dict", "kind": "function", "doc": "Instantiates a Config object from a dictionary
\n\nParameters \n\n\ndata_class : \nkwargs : \n \n\nReturns \n", "signature": "(cls , data_class , ** kwargs ): ", "funcdef": "def"}, {"fullname": "debeir.core.config.Config.validate", "modulename": "debeir.core.config", "qualname": "Config.validate", "kind": "function", "doc": "Validates if the config is correct.\nMust be implemented by inherited classes.
\n", "signature": "(self ): ", "funcdef": "def"}, {"fullname": "debeir.core.config.GenericConfig", "modulename": "debeir.core.config", "qualname": "GenericConfig", "kind": "class", "doc": "Generic NIR Configuration file for which all configs will inherit
\n", "bases": "Config, abc.ABC"}, {"fullname": "debeir.core.config.GenericConfig.__init__", "modulename": "debeir.core.config", "qualname": "GenericConfig.__init__", "kind": "function", "doc": "
\n", "signature": "(\tquery_type : str , \tindex : str = None , \tencoder_normalize : bool = True , \tablations : bool = False , \tnorm_weight : float = None , \tautomatic : bool = None , \tencoder : object = None , \tencoder_fp : str = None , \tquery_weights : List [ float ] = None , \tcosine_weights : List [ float ] = None , \tevaluate : bool = False , \tqrels : str = None , \tconfig_fn : str = None , \tquery_fn : str = None , \tparser_fn : str = None , \texecutor_fn : str = None , \tcosine_ceiling : float = None , \ttopics_path : str = None , \treturn_id_only : bool = False , \toverwrite_output_if_exists : bool = False , \toutput_file : str = None , \trun_name : str = None ) "}, {"fullname": "debeir.core.config.GenericConfig.from_toml", "modulename": "debeir.core.config", "qualname": "GenericConfig.from_toml", "kind": "function", "doc": "Instantiates a Config object from a toml file
\n\nParameters \n\n\nfp : File path of the Config TOML file \nfield_class : Class of the Config object to be instantiated \nargs : Arguments to be passed to Config \nkwargs : Keyword arguments to be passed \n \n\nReturns \n\n\nA instantiated and validated Config object.\n
\n \n", "signature": "(\tcls , \tfp : Union [ str , pathlib . Path ] , \t* args , \t** kwargs ) -> debeir . core . config . GenericConfig : ", "funcdef": "def"}, {"fullname": "debeir.core.config.ElasticsearchConfig", "modulename": "debeir.core.config", "qualname": "ElasticsearchConfig", "kind": "class", "doc": "Basic Elasticsearch configuration file settings from the master nir.toml file
\n", "bases": "Config"}, {"fullname": "debeir.core.config.ElasticsearchConfig.__init__", "modulename": "debeir.core.config", "qualname": "ElasticsearchConfig.__init__", "kind": "function", "doc": "
\n", "signature": "(protocol : str , ip : str , port : str , timeout : int ) "}, {"fullname": "debeir.core.config.ElasticsearchConfig.validate", "modulename": "debeir.core.config", "qualname": "ElasticsearchConfig.validate", "kind": "function", "doc": "Checks if Elasticsearch URL is correct
\n", "signature": "(self ): ", "funcdef": "def"}, {"fullname": "debeir.core.config.SolrConfig", "modulename": "debeir.core.config", "qualname": "SolrConfig", "kind": "class", "doc": "Basic Solr configuration file settings from the master nir.toml file
\n", "bases": "ElasticsearchConfig"}, {"fullname": "debeir.core.config.SolrConfig.__init__", "modulename": "debeir.core.config", "qualname": "SolrConfig.__init__", "kind": "function", "doc": "
\n", "signature": "(protocol : str , ip : str , port : str , timeout : int ) "}, {"fullname": "debeir.core.config.MetricsConfig", "modulename": "debeir.core.config", "qualname": "MetricsConfig", "kind": "class", "doc": "Basic Metrics configuration file settings from the master nir.toml file
\n", "bases": "Config"}, {"fullname": "debeir.core.config.MetricsConfig.__init__", "modulename": "debeir.core.config", "qualname": "MetricsConfig.__init__", "kind": "function", "doc": "
\n", "signature": "(metrics : List [ str ] ) "}, {"fullname": "debeir.core.config.MetricsConfig.validate", "modulename": "debeir.core.config", "qualname": "MetricsConfig.validate", "kind": "function", "doc": "Checks if each Metrics is usable by evaluator classes
\n", "signature": "(self ): ", "funcdef": "def"}, {"fullname": "debeir.core.config.NIRConfig", "modulename": "debeir.core.config", "qualname": "NIRConfig", "kind": "class", "doc": "Basic NIR configuration file settings from the master nir.toml file
\n", "bases": "Config"}, {"fullname": "debeir.core.config.NIRConfig.__init__", "modulename": "debeir.core.config", "qualname": "NIRConfig.__init__", "kind": "function", "doc": "
\n", "signature": "(\tnorm_weight : str , \tevaluate : bool , \treturn_size : int , \toutput_directory : str ) "}, {"fullname": "debeir.core.config.NIRConfig.validate", "modulename": "debeir.core.config", "qualname": "NIRConfig.validate", "kind": "function", "doc": "Validates if the config is correct.\nMust be implemented by inherited classes.
\n", "signature": "(self ): ", "funcdef": "def"}, {"fullname": "debeir.core.config.apply_config", "modulename": "debeir.core.config", "qualname": "apply_config", "kind": "function", "doc": "Configuration decorator.
\n\nParameters \n\n\nfunc : Decorated function \n \n\nReturns \n", "signature": "(func ): ", "funcdef": "def"}, {"fullname": "debeir.core.config.override_with_toml_config", "modulename": "debeir.core.config", "qualname": "override_with_toml_config", "kind": "function", "doc": "Configuration decorator. Overwrite a functions kwargs and args with a specified toml config file.\nPass override_with_config=path/to/config
\n\nParameters \n\n\nfunc : Decorated function \n \n\nReturns \n", "signature": "(func ): ", "funcdef": "def"}, {"fullname": "debeir.core.config.save_kwargs_to_file", "modulename": "debeir.core.config", "qualname": "save_kwargs_to_file", "kind": "function", "doc": "
\n", "signature": "(func ): ", "funcdef": "def"}, {"fullname": "debeir.core.converters", "modulename": "debeir.core.converters", "kind": "module", "doc": "
\n"}, {"fullname": "debeir.core.converters.ParsedTopicsToDataset", "modulename": "debeir.core.converters", "qualname": "ParsedTopicsToDataset", "kind": "class", "doc": "Converts a parser's output to a huggingface dataset object.
\n"}, {"fullname": "debeir.core.converters.ParsedTopicsToDataset.__init__", "modulename": "debeir.core.converters", "qualname": "ParsedTopicsToDataset.__init__", "kind": "function", "doc": "
\n", "signature": "() "}, {"fullname": "debeir.core.converters.ParsedTopicsToDataset.convert", "modulename": "debeir.core.converters", "qualname": "ParsedTopicsToDataset.convert", "kind": "function", "doc": "Flatten a Dict of shape (traditional parser output)\n{topic_id: {\n \"Facet_1\": ...\n \"Facet_2\": ...\n }\n}
\n\n->
\n\nTo a flattened arrow-like dataset.\n{\ntopic_ids: [],\nFacet_1s: [],\nFacet_2s: [],\n}
\n\nParameters \n\n\noutput : Topics output from the parser object \n \n\nReturns \n", "signature": "(\tcls , \tparser : debeir . core . parser . Parser , \toutput : Dict [ Union [ str , int ], Dict ] ): ", "funcdef": "def"}, {"fullname": "debeir.core.document", "modulename": "debeir.core.document", "kind": "module", "doc": "
\n"}, {"fullname": "debeir.core.document.Document", "modulename": "debeir.core.document", "qualname": "Document", "kind": "class", "doc": "Generic Document class.\nUsed as an interface for interacting across multiple indexes with different mappings.
\n"}, {"fullname": "debeir.core.document.Document.__init__", "modulename": "debeir.core.document", "qualname": "Document.__init__", "kind": "function", "doc": "
\n", "signature": "(\tdoc_id : Union [ int , float , str ] , \ttopic_num : Union [ int , str , float ] = None , \tfacets : Dict = None , \tscore : Union [ float , int ] = 0.0 , \tscores : Dict [ str , Union [ float , int ]] = < factory > ) "}, {"fullname": "debeir.core.document.Document.from_results", "modulename": "debeir.core.document", "qualname": "Document.from_results", "kind": "function", "doc": "Produces a list of Document objects from raw results returned from the index
\n\nIn the format {topic_num: [Document, ..., Document]}
\n", "signature": "(\tcls , \tresults , \t* args , \t** kwargs ) -> Dict [ Union [ int , float ], debeir . core . document . Document ] : ", "funcdef": "def"}, {"fullname": "debeir.core.document.Document.get_document_id", "modulename": "debeir.core.document", "qualname": "Document.get_document_id", "kind": "function", "doc": "Returns \n\n\nself.doc_id\n
\n \n", "signature": "(self ): ", "funcdef": "def"}, {"fullname": "debeir.core.document.Document.flatten_facets", "modulename": "debeir.core.document", "qualname": "Document.flatten_facets", "kind": "function", "doc": "Flattens multi-level internal document facets into a single level\n e.g. Doc['Upper']['Lower'] -> Doc['Upper_Lower']
\n\nParameters \n\n\n", "signature": "(self , * args , ** kwargs ): ", "funcdef": "def"}, {"fullname": "debeir.core.document.Document.get_document_facet", "modulename": "debeir.core.document", "qualname": "Document.get_document_facet", "kind": "function", "doc": "Retrieve a document facet\nWorks for multidimensional keys or single
\n\nParameters \n\n\nkey : Facet to retrieve \nsep : The seperator for multidimensional key \n \n\nReturns \n\n\nReturns the document facet given the key (field)\n
\n \n", "signature": "(self , key , sep = '_' ): ", "funcdef": "def"}, {"fullname": "debeir.core.document.Document.set", "modulename": "debeir.core.document", "qualname": "Document.set", "kind": "function", "doc": "Set attributes of the object. Use keyword arguments to do so. Works as a builder class.\ndoc.set(doc_id=\"123\").set(facets={\"title\": \"my title\"})
\n\nParameters \n\n\ndoc_id : \nfacets : \nscore : \nfacet : \nfacet_value : \n \n\nReturns \n\n\nReturns document object\n
\n \n", "signature": "(\tself , \tdoc_id = None , \tfacets = None , \tscore = None , \tfacet = None , \tfacet_value = None ) -> debeir . core . document . Document : ", "funcdef": "def"}, {"fullname": "debeir.core.document.Document.to_trec_format", "modulename": "debeir.core.document", "qualname": "Document.to_trec_format", "kind": "function", "doc": "Returns TREC format for the document
\n\nReturns \n\n\nA trec formatted string\n
\n \n", "signature": "(self , rank , run_name ) -> str : ", "funcdef": "def"}, {"fullname": "debeir.core.document.Document.get_trec_format", "modulename": "debeir.core.document", "qualname": "Document.get_trec_format", "kind": "function", "doc": "Get the trec format of a list of ranked documents. This function is a generator.
\n\nParameters \n\n\nranked_list : A list of Document-type objects \nrun_name : Run name to print in the TREC formatted string \nsort : Whether to sort the input list in descending order of score. \nsorting_func : Custom sorting function will be used if provided \n \n", "signature": "(\tcls , \tranked_list : List [ debeir . core . document . Document ] , \trun_name = 'NO_RUN_NAME' , \tsort = True , \tsorting_func = None ): ", "funcdef": "def"}, {"fullname": "debeir.core.document.ElasticsearchDocument", "modulename": "debeir.core.document", "qualname": "ElasticsearchDocument", "kind": "class", "doc": "Generic Document class.\nUsed as an interface for interacting across multiple indexes with different mappings.
\n", "bases": "Document"}, {"fullname": "debeir.core.document.ElasticsearchDocument.from_results", "modulename": "debeir.core.document", "qualname": "ElasticsearchDocument.from_results", "kind": "function", "doc": "Produces a list of Document objects from raw results returned from the index
\n\nIn the format {topic_num: [Document, ..., Document]}
\n", "signature": "(\tcls , \tresults , \tquery_cls , \tignore_facets = True , \t* args , \t** kwargs ) -> Dict [ Union [ int , float ], debeir . core . document . Document ] : ", "funcdef": "def"}, {"fullname": "debeir.core.executor", "modulename": "debeir.core.executor", "kind": "module", "doc": "
\n"}, {"fullname": "debeir.core.executor.GenericElasticsearchExecutor", "modulename": "debeir.core.executor", "qualname": "GenericElasticsearchExecutor", "kind": "class", "doc": "Generic Executor class for Elasticsearch
\n", "bases": "debeir.engines.elasticsearch.executor.ElasticsearchExecutor"}, {"fullname": "debeir.core.executor.GenericElasticsearchExecutor.__init__", "modulename": "debeir.core.executor", "qualname": "GenericElasticsearchExecutor.__init__", "kind": "function", "doc": "
\n", "signature": "(\ttopics : Dict [ Union [ str , int ], Dict [ str , str ]] , \tclient : elasticsearch . AsyncElasticsearch , \tindex_name : str , \toutput_file : str , \tquery : debeir . core . query . GenericElasticsearchQuery , \tencoder : Optional [ debeir . rankers . transformer_sent_encoder . Encoder ] = None , \tconfig = None , \t* args , \t** kwargs ) "}, {"fullname": "debeir.core.executor.GenericElasticsearchExecutor.generate_query", "modulename": "debeir.core.executor", "qualname": "GenericElasticsearchExecutor.generate_query", "kind": "function", "doc": "Generates a standard BM25 query given the topic number
\n\nParameters \n\n\ntopic_num : Query topic number to generate \nbest_fields : Whether to use a curated list of fields \nkwargs : \n \n\nReturns \n", "signature": "(self , topic_num , best_fields = True , ** kwargs ): ", "funcdef": "def"}, {"fullname": "debeir.core.executor.GenericElasticsearchExecutor.generate_embedding_query", "modulename": "debeir.core.executor", "qualname": "GenericElasticsearchExecutor.generate_embedding_query", "kind": "function", "doc": "Executes an NIR-style query with combined scoring.
\n\nParameters \n\n\ntopic_num : \ncosine_weights : \nquery_weights : \nnorm_weight : \nautomatic_scores : \nkwargs : \n \n\nReturns \n", "signature": "(\tself , \ttopic_num , \tcosine_weights = None , \tquery_weights = None , \tnorm_weight = 2.15 , \tautomatic_scores = None , \t** kwargs ): ", "funcdef": "def"}, {"fullname": "debeir.core.executor.GenericElasticsearchExecutor.execute_query", "modulename": "debeir.core.executor", "qualname": "GenericElasticsearchExecutor.execute_query", "kind": "function", "doc": "Execute a query given parameters
\n\nParameters \n\n\n", "signature": "(\tself , \tquery = None , \treturn_size : int = None , \treturn_id_only : bool = None , \ttopic_num = None , \tablation = False , \tquery_type = None , \t** kwargs ): ", "funcdef": "async def"}, {"fullname": "debeir.core.executor.GenericElasticsearchExecutor.run_automatic_adjustment", "modulename": "debeir.core.executor", "qualname": "GenericElasticsearchExecutor.run_automatic_adjustment", "kind": "function", "doc": "Get the normalization constant to be used in NIR-style queries for all topics given an initial\nrun of BM25 results.
\n", "signature": "(self , return_results = False ): ", "funcdef": "async def"}, {"fullname": "debeir.core.executor.GenericElasticsearchExecutor.build_from_config", "modulename": "debeir.core.executor", "qualname": "GenericElasticsearchExecutor.build_from_config", "kind": "function", "doc": "Build an query executor engine from a config file.
\n", "signature": "(\tcls , \ttopics : Dict , \tquery_obj : debeir . core . query . GenericElasticsearchQuery , \tclient , \tconfig : debeir . core . config . GenericConfig , \tnir_config : debeir . core . config . NIRConfig ): ", "funcdef": "def"}, {"fullname": "debeir.core.indexer", "modulename": "debeir.core.indexer", "kind": "module", "doc": "
\n"}, {"fullname": "debeir.core.indexer.Indexer", "modulename": "debeir.core.indexer", "qualname": "Indexer", "kind": "class", "doc": "
\n"}, {"fullname": "debeir.core.indexer.Indexer.__init__", "modulename": "debeir.core.indexer", "qualname": "Indexer.__init__", "kind": "function", "doc": "
\n", "signature": "(client ) "}, {"fullname": "debeir.core.indexer.Indexer.get_field", "modulename": "debeir.core.indexer", "qualname": "Indexer.get_field", "kind": "function", "doc": "
\n", "signature": "(self , document , field ): ", "funcdef": "def"}, {"fullname": "debeir.core.indexer.SemanticElasticsearchIndexer", "modulename": "debeir.core.indexer", "qualname": "SemanticElasticsearchIndexer", "kind": "class", "doc": "Create a NIR-style index, with dense field representations with provided sentence encoder\nAssumes you've already indexed to start with.
\n", "bases": "Indexer, threading.Thread"}, {"fullname": "debeir.core.indexer.SemanticElasticsearchIndexer.__init__", "modulename": "debeir.core.indexer", "qualname": "SemanticElasticsearchIndexer.__init__", "kind": "function", "doc": "This constructor should always be called with keyword arguments. Arguments are:
\n\ngroup should be None; reserved for future extension when a ThreadGroup\nclass is implemented.
\n\ntarget is the callable object to be invoked by the run()\nmethod. Defaults to None, meaning nothing is called.
\n\nname is the thread name. By default, a unique name is constructed of\nthe form \"Thread-N\" where N is a small decimal number.
\n\nargs is the argument tuple for the target invocation. Defaults to ().
\n\nkwargs is a dictionary of keyword arguments for the target\ninvocation. Defaults to {}.
\n\nIf a subclass overrides the constructor, it must make sure to invoke\nthe base class constructor (Thread.__init__()) before doing anything\nelse to the thread.
\n", "signature": "(\tes_client : elasticsearch . Elasticsearch , \tencoder : debeir . rankers . transformer_sent_encoder . Encoder , \tindex : str , \tfields_to_encode : List [ str ] , \tqueue : queue . Queue ) "}, {"fullname": "debeir.core.indexer.SemanticElasticsearchIndexer.update_mappings", "modulename": "debeir.core.indexer", "qualname": "SemanticElasticsearchIndexer.update_mappings", "kind": "function", "doc": "
\n", "signature": "(self , index , fields , client : elasticsearch . Elasticsearch ): ", "funcdef": "def"}, {"fullname": "debeir.core.indexer.SemanticElasticsearchIndexer.get_field", "modulename": "debeir.core.indexer", "qualname": "SemanticElasticsearchIndexer.get_field", "kind": "function", "doc": "
\n", "signature": "(self , document , field ): ", "funcdef": "def"}, {"fullname": "debeir.core.indexer.SemanticElasticsearchIndexer.index_document", "modulename": "debeir.core.indexer", "qualname": "SemanticElasticsearchIndexer.index_document", "kind": "function", "doc": "
\n", "signature": "(self , document ): ", "funcdef": "def"}, {"fullname": "debeir.core.indexer.SemanticElasticsearchIndexer.run", "modulename": "debeir.core.indexer", "qualname": "SemanticElasticsearchIndexer.run", "kind": "function", "doc": "Method representing the thread's activity.
\n\nYou may override this method in a subclass. The standard run() method\ninvokes the callable object passed to the object's constructor as the\ntarget argument, if any, with sequential and keyword arguments taken\nfrom the args and kwargs arguments, respectively.
\n", "signature": "(self ): ", "funcdef": "def"}, {"fullname": "debeir.core.parser", "modulename": "debeir.core.parser", "kind": "module", "doc": "
\n"}, {"fullname": "debeir.core.parser.Parser", "modulename": "debeir.core.parser", "qualname": "Parser", "kind": "class", "doc": "Parser interface
\n"}, {"fullname": "debeir.core.parser.Parser.__init__", "modulename": "debeir.core.parser", "qualname": "Parser.__init__", "kind": "function", "doc": "
\n", "signature": "(id_field : object , parse_fields : List [ str ] ) "}, {"fullname": "debeir.core.parser.Parser.normalize", "modulename": "debeir.core.parser", "qualname": "Parser.normalize", "kind": "function", "doc": "Flatten the dictionary, i.e. from Dict[int, Dict] -> Dict[str, str_or_int]
\n\nParameters \n\n\n\nReturns \n", "signature": "(cls , input_dict ) -> Dict : ", "funcdef": "def"}, {"fullname": "debeir.core.parser.Parser.get_topics", "modulename": "debeir.core.parser", "qualname": "Parser.get_topics", "kind": "function", "doc": "Instance method for getting topics, forwards instance self parameters to the _get_topics class method.
\n", "signature": "(self , path , * args , ** kwargs ): ", "funcdef": "def"}, {"fullname": "debeir.core.parser.PickleParser", "modulename": "debeir.core.parser", "qualname": "PickleParser", "kind": "class", "doc": "Load topics from a pickle file
\n", "bases": "Parser"}, {"fullname": "debeir.core.parser.PickleParser.__init__", "modulename": "debeir.core.parser", "qualname": "PickleParser.__init__", "kind": "function", "doc": "
\n", "signature": "(id_field : object , parse_fields : List [ str ] ) "}, {"fullname": "debeir.core.parser.XMLParser", "modulename": "debeir.core.parser", "qualname": "XMLParser", "kind": "class", "doc": "Load topics from an XML file
\n", "bases": "Parser"}, {"fullname": "debeir.core.parser.XMLParser.__init__", "modulename": "debeir.core.parser", "qualname": "XMLParser.__init__", "kind": "function", "doc": "
\n", "signature": "(id_field : str , parse_fields : List [ str ] , topic_field_name : str ) "}, {"fullname": "debeir.core.parser.XMLParser.unwrap", "modulename": "debeir.core.parser", "qualname": "XMLParser.unwrap", "kind": "function", "doc": "Converts defaultdict to dict and list of size 1 to just the element
\n\nParameters \n\n\n", "signature": "(cls , doc_dict , key ): ", "funcdef": "def"}, {"fullname": "debeir.core.parser.CSVParser", "modulename": "debeir.core.parser", "qualname": "CSVParser", "kind": "class", "doc": "Loads topics from a CSV file
\n", "bases": "Parser"}, {"fullname": "debeir.core.parser.CSVParser.__init__", "modulename": "debeir.core.parser", "qualname": "CSVParser.__init__", "kind": "function", "doc": "
\n", "signature": "(id_field = None , parse_fields = None ) "}, {"fullname": "debeir.core.parser.TSVParser", "modulename": "debeir.core.parser", "qualname": "TSVParser", "kind": "class", "doc": "
\n", "bases": "CSVParser"}, {"fullname": "debeir.core.parser.TSVParser.__init__", "modulename": "debeir.core.parser", "qualname": "TSVParser.__init__", "kind": "function", "doc": "
\n", "signature": "(id_field : object , parse_fields : List [ str ] ) "}, {"fullname": "debeir.core.parser.JsonLinesParser", "modulename": "debeir.core.parser", "qualname": "JsonLinesParser", "kind": "class", "doc": "Loads topics from a jsonl file,\na JSON per line
\n\nProvide parse_fields, id_field and whether to ignore full matches on json keys\nsecondary_id appends to the primary id as jsonlines are flattened structure and may contain duplicate ids.
\n", "bases": "Parser"}, {"fullname": "debeir.core.parser.JsonLinesParser.__init__", "modulename": "debeir.core.parser", "qualname": "JsonLinesParser.__init__", "kind": "function", "doc": "
\n", "signature": "(\tid_field : str , \tparse_fields : List [ str ] , \tignore_full_match : bool = True , \tsecondary_id : str = None ) "}, {"fullname": "debeir.core.pipeline", "modulename": "debeir.core.pipeline", "kind": "module", "doc": "
\n"}, {"fullname": "debeir.core.pipeline.Pipeline", "modulename": "debeir.core.pipeline", "qualname": "Pipeline", "kind": "class", "doc": "
\n"}, {"fullname": "debeir.core.pipeline.Pipeline.__init__", "modulename": "debeir.core.pipeline", "qualname": "Pipeline.__init__", "kind": "function", "doc": "
\n", "signature": "(\tengine : debeir . core . executor . GenericElasticsearchExecutor , \tengine_name : str , \tmetrics_config , \tengine_config , \tnir_config , \trun_config : debeir . core . config . Config , \tcallbacks = None ) "}, {"fullname": "debeir.core.pipeline.Pipeline.disable", "modulename": "debeir.core.pipeline", "qualname": "Pipeline.disable", "kind": "function", "doc": "
\n", "signature": "(self , parts : list ): ", "funcdef": "def"}, {"fullname": "debeir.core.pipeline.Pipeline.build_from_config", "modulename": "debeir.core.pipeline", "qualname": "Pipeline.build_from_config", "kind": "function", "doc": "
\n", "signature": "(cls , nir_config_fp , engine , config_fp ) -> debeir . core . pipeline . Pipeline : ", "funcdef": "def"}, {"fullname": "debeir.core.pipeline.Pipeline.run_pipeline", "modulename": "debeir.core.pipeline", "qualname": "Pipeline.run_pipeline", "kind": "function", "doc": "
\n", "signature": "(self , * args , ** kwargs ): ", "funcdef": "async def"}, {"fullname": "debeir.core.pipeline.NIRPipeline", "modulename": "debeir.core.pipeline", "qualname": "NIRPipeline", "kind": "class", "doc": "
\n", "bases": "Pipeline"}, {"fullname": "debeir.core.pipeline.NIRPipeline.__init__", "modulename": "debeir.core.pipeline", "qualname": "NIRPipeline.__init__", "kind": "function", "doc": "
\n", "signature": "(* args , ** kwargs ) "}, {"fullname": "debeir.core.pipeline.NIRPipeline.prehook", "modulename": "debeir.core.pipeline", "qualname": "NIRPipeline.prehook", "kind": "function", "doc": "
\n", "signature": "(self ): ", "funcdef": "async def"}, {"fullname": "debeir.core.pipeline.NIRPipeline.run_engine", "modulename": "debeir.core.pipeline", "qualname": "NIRPipeline.run_engine", "kind": "function", "doc": "
\n", "signature": "(self , * args , ** kwargs ): ", "funcdef": "async def"}, {"fullname": "debeir.core.pipeline.NIRPipeline.posthook", "modulename": "debeir.core.pipeline", "qualname": "NIRPipeline.posthook", "kind": "function", "doc": "
\n", "signature": "(self , * args , ** kwargs ): ", "funcdef": "async def"}, {"fullname": "debeir.core.pipeline.NIRPipeline.run_pipeline", "modulename": "debeir.core.pipeline", "qualname": "NIRPipeline.run_pipeline", "kind": "function", "doc": "
\n", "signature": "(self , * args , return_results = False , ** kwargs ): ", "funcdef": "async def"}, {"fullname": "debeir.core.pipeline.NIRPipeline.register_callback", "modulename": "debeir.core.pipeline", "qualname": "NIRPipeline.register_callback", "kind": "function", "doc": "
\n", "signature": "(self , cb ): ", "funcdef": "def"}, {"fullname": "debeir.core.pipeline.BM25Pipeline", "modulename": "debeir.core.pipeline", "qualname": "BM25Pipeline", "kind": "class", "doc": "
\n", "bases": "NIRPipeline"}, {"fullname": "debeir.core.pipeline.BM25Pipeline.run_pipeline", "modulename": "debeir.core.pipeline", "qualname": "BM25Pipeline.run_pipeline", "kind": "function", "doc": "
\n", "signature": "(self , * args , return_results = False , ** kwargs ): ", "funcdef": "async def"}, {"fullname": "debeir.core.query", "modulename": "debeir.core.query", "kind": "module", "doc": "
\n"}, {"fullname": "debeir.core.query.Query", "modulename": "debeir.core.query", "qualname": "Query", "kind": "class", "doc": "A query interface class
\n\nParameters \n\n\ntopics : Topics that the query will be composed of \nconfig : Config object that contains the settings for querying \n \n"}, {"fullname": "debeir.core.query.Query.__init__", "modulename": "debeir.core.query", "qualname": "Query.__init__", "kind": "function", "doc": "
\n", "signature": "(\ttopics : Dict [ int , Dict [ str , str ]] , \tconfig : debeir . core . config . GenericConfig ) "}, {"fullname": "debeir.core.query.GenericElasticsearchQuery", "modulename": "debeir.core.query", "qualname": "GenericElasticsearchQuery", "kind": "class", "doc": "A generic elasticsearch query. Contains methods for NIR-style (embedding) queries and normal BM25 queries.\nRequires topics, configs to be included
\n", "bases": "Query"}, {"fullname": "debeir.core.query.GenericElasticsearchQuery.__init__", "modulename": "debeir.core.query", "qualname": "GenericElasticsearchQuery.__init__", "kind": "function", "doc": "
\n", "signature": "(\ttopics , \tconfig , \ttop_bm25_scores = None , \tmappings = None , \tid_mapping = None , \t* args , \t** kwargs ) "}, {"fullname": "debeir.core.query.GenericElasticsearchQuery.generate_query", "modulename": "debeir.core.query", "qualname": "GenericElasticsearchQuery.generate_query", "kind": "function", "doc": "Generates a simple BM25 query based off the query facets. Searches over all the document facets.
\n\nParameters \n\n\ntopic_num : \nargs : \nkwargs : \n \n\nReturns \n", "signature": "(self , topic_num , * args , ** kwargs ): ", "funcdef": "def"}, {"fullname": "debeir.core.query.GenericElasticsearchQuery.set_bm25_scores", "modulename": "debeir.core.query", "qualname": "GenericElasticsearchQuery.set_bm25_scores", "kind": "function", "doc": "Sets BM25 scores that are used for NIR-style scoring. The top BM25 score for each topic is used\nfor log normalization.
\n\nScore = log(bm25)/log(z) + embed_score
\n\nParameters \n\n\nscores: Top BM25 Scores of the form {topic_num : top_bm25_score} \n \n", "signature": "(self , scores : Dict [ Union [ str , int ], Union [ int , float ]] ): ", "funcdef": "def"}, {"fullname": "debeir.core.query.GenericElasticsearchQuery.has_bm25_scores", "modulename": "debeir.core.query", "qualname": "GenericElasticsearchQuery.has_bm25_scores", "kind": "function", "doc": "Checks if BM25 scores have been set
\n\nReturns \n", "signature": "(self ): ", "funcdef": "def"}, {"fullname": "debeir.core.query.GenericElasticsearchQuery.generate_query_embedding", "modulename": "debeir.core.query", "qualname": "GenericElasticsearchQuery.generate_query_embedding", "kind": "function", "doc": "Generates an embedding script score query for Elasticsearch as part of the NIR scoring function.
\n\nParameters \n\n\ntopic_num : The topic number to search for \nencoder : The encoder that will be used for encoding the topics \nnorm_weight : The BM25 log normalization constant \nablations : Whether to execute ablation style queries (i.e. one query facet\nor one document facet at a time) \ncosine_ceiling : Cosine ceiling used for automatic z-log normalization parameter calculation \nargs : \nkwargs : Pass disable_cache to disable encoder caching \n \n\nReturns \n\n\nAn elasticsearch script_score query\n
\n \n", "signature": "(self , * args , ** kwargs ): ", "funcdef": "def"}, {"fullname": "debeir.core.query.GenericElasticsearchQuery.get_id_mapping", "modulename": "debeir.core.query", "qualname": "GenericElasticsearchQuery.get_id_mapping", "kind": "function", "doc": "Get the document ID
\n\nParameters \n\n\nhit : The raw document result \n \n\nReturns \n\n\nThe document's ID\n
\n \n", "signature": "(cls , hit ): ", "funcdef": "def"}, {"fullname": "debeir.core.results", "modulename": "debeir.core.results", "kind": "module", "doc": "
\n"}, {"fullname": "debeir.core.results.Results", "modulename": "debeir.core.results", "qualname": "Results", "kind": "class", "doc": "
\n"}, {"fullname": "debeir.core.results.Results.__init__", "modulename": "debeir.core.results", "qualname": "Results.__init__", "kind": "function", "doc": "
\n", "signature": "(results : List , query_cls , engine_name ) "}, {"fullname": "debeir.core.results.Results.get_topic_ids", "modulename": "debeir.core.results", "qualname": "Results.get_topic_ids", "kind": "function", "doc": "
\n", "signature": "(self ): ", "funcdef": "def"}, {"fullname": "debeir.datasets", "modulename": "debeir.datasets", "kind": "module", "doc": "Contains data_sets implemented from nir.interfaces
\n\n\nParser (For reading data from files into a Dict object) \nQuery object (Generating queries)\n\nThese query objects can be very lightweight containing only the mappings of the index. \n \n \n"}, {"fullname": "debeir.datasets.bioreddit", "modulename": "debeir.datasets.bioreddit", "kind": "module", "doc": "
\n"}, {"fullname": "debeir.datasets.bioreddit.BioRedditSubmissionParser", "modulename": "debeir.datasets.bioreddit", "qualname": "BioRedditSubmissionParser", "kind": "class", "doc": "Parser for the BioReddit Submission Dataset
\n", "bases": "debeir.core.parser.CSVParser"}, {"fullname": "debeir.datasets.bioreddit.BioRedditSubmissionParser.get_topics", "modulename": "debeir.datasets.bioreddit", "qualname": "BioRedditSubmissionParser.get_topics", "kind": "function", "doc": "Instance method for getting topics, forwards instance self parameters to the _get_topics class method.
\n", "signature": "(cls , csvfile ) -> Dict [ int , Dict [ str , str ]] : ", "funcdef": "def"}, {"fullname": "debeir.datasets.bioreddit.BioRedditCommentParser", "modulename": "debeir.datasets.bioreddit", "qualname": "BioRedditCommentParser", "kind": "class", "doc": "Parser for the BioReddit Comment Dataset
\n", "bases": "debeir.core.parser.CSVParser"}, {"fullname": "debeir.datasets.bioreddit.BioRedditCommentParser.get_topics", "modulename": "debeir.datasets.bioreddit", "qualname": "BioRedditCommentParser.get_topics", "kind": "function", "doc": "Instance method for getting topics, forwards instance self parameters to the _get_topics class method.
\n", "signature": "(cls , csvfile ) -> Dict [ str , Dict [ str , str ]] : ", "funcdef": "def"}, {"fullname": "debeir.datasets.bioreddit.BioRedditElasticsearchQuery", "modulename": "debeir.datasets.bioreddit", "qualname": "BioRedditElasticsearchQuery", "kind": "class", "doc": "Elasticsearch Query object for the BioReddit
\n", "bases": "debeir.core.query.GenericElasticsearchQuery"}, {"fullname": "debeir.datasets.bioreddit.BioRedditElasticsearchQuery.__init__", "modulename": "debeir.datasets.bioreddit", "qualname": "BioRedditElasticsearchQuery.__init__", "kind": "function", "doc": "
\n", "signature": "(topics , config , * args , ** kwargs ) "}, {"fullname": "debeir.datasets.clinical_trials", "modulename": "debeir.datasets.clinical_trials", "kind": "module", "doc": "
\n"}, {"fullname": "debeir.datasets.clinical_trials.TrialsQueryConfig", "modulename": "debeir.datasets.clinical_trials", "qualname": "TrialsQueryConfig", "kind": "class", "doc": "
\n", "bases": "debeir.core.config.GenericConfig"}, {"fullname": "debeir.datasets.clinical_trials.TrialsQueryConfig.__init__", "modulename": "debeir.datasets.clinical_trials", "qualname": "TrialsQueryConfig.__init__", "kind": "function", "doc": "
\n", "signature": "(\tquery_type : str , \tindex : str = None , \tencoder_normalize : bool = True , \tablations : bool = False , \tnorm_weight : float = None , \tautomatic : bool = None , \tencoder : object = None , \tencoder_fp : str = None , \tquery_weights : List [ float ] = None , \tcosine_weights : List [ float ] = None , \tevaluate : bool = False , \tqrels : str = None , \tconfig_fn : str = None , \tquery_fn : str = None , \tparser_fn : str = None , \texecutor_fn : str = None , \tcosine_ceiling : float = None , \ttopics_path : str = None , \treturn_id_only : bool = False , \toverwrite_output_if_exists : bool = False , \toutput_file : str = None , \trun_name : str = None , \tquery_field_usage : str = None , \tembed_field_usage : str = None , \tfields : List [ str ] = None ) "}, {"fullname": "debeir.datasets.clinical_trials.TrialsQueryConfig.validate", "modulename": "debeir.datasets.clinical_trials", "qualname": "TrialsQueryConfig.validate", "kind": "function", "doc": "Checks if query type is included, and checks if an encoder is included for embedding queries
\n", "signature": "(self ): ", "funcdef": "def"}, {"fullname": "debeir.datasets.clinical_trials.TrialsQueryConfig.from_toml", "modulename": "debeir.datasets.clinical_trials", "qualname": "TrialsQueryConfig.from_toml", "kind": "function", "doc": "Instantiates a Config object from a toml file
\n\nParameters \n\n\nfp : File path of the Config TOML file \nfield_class : Class of the Config object to be instantiated \nargs : Arguments to be passed to Config \nkwargs : Keyword arguments to be passed \n \n\nReturns \n\n\nA instantiated and validated Config object.\n
\n \n", "signature": "(cls , fp : str , * args , ** kwargs ) -> debeir . core . config . GenericConfig : ", "funcdef": "def"}, {"fullname": "debeir.datasets.clinical_trials.TrialsQueryConfig.from_dict", "modulename": "debeir.datasets.clinical_trials", "qualname": "TrialsQueryConfig.from_dict", "kind": "function", "doc": "Instantiates a Config object from a dictionary
\n\nParameters \n\n\ndata_class : \nkwargs : \n \n\nReturns \n", "signature": "(cls , ** kwargs ) -> debeir . core . config . GenericConfig : ", "funcdef": "def"}, {"fullname": "debeir.datasets.clinical_trials.TrialsElasticsearchQuery", "modulename": "debeir.datasets.clinical_trials", "qualname": "TrialsElasticsearchQuery", "kind": "class", "doc": "Elasticsearch Query object for the Clinical Trials Index
\n", "bases": "debeir.core.query.GenericElasticsearchQuery"}, {"fullname": "debeir.datasets.clinical_trials.TrialsElasticsearchQuery.__init__", "modulename": "debeir.datasets.clinical_trials", "qualname": "TrialsElasticsearchQuery.__init__", "kind": "function", "doc": "
\n", "signature": "(topics , query_type , config = None , * args , ** kwargs ) "}, {"fullname": "debeir.datasets.clinical_trials.TrialsElasticsearchQuery.generate_query", "modulename": "debeir.datasets.clinical_trials", "qualname": "TrialsElasticsearchQuery.generate_query", "kind": "function", "doc": "Generates a query for the clinical trials index
\n\nParameters \n\n\ntopic_num : Topic number to search \nquery_field_usage : Which document facets to search over \nkwargs : \n \n\nReturns \n\n\nA basic elasticsearch query for clinical trials\n
\n \n", "signature": "(self , * args , ** kwargs ): ", "funcdef": "def"}, {"fullname": "debeir.datasets.clinical_trials.TrialsElasticsearchQuery.generate_query_ablation", "modulename": "debeir.datasets.clinical_trials", "qualname": "TrialsElasticsearchQuery.generate_query_ablation", "kind": "function", "doc": "Only search one document facet at a time
\n\nParameters \n\n\ntopic_num : \nkwargs : \n \n\nReturns \n", "signature": "(self , topic_num , ** kwargs ): ", "funcdef": "def"}, {"fullname": "debeir.datasets.clinical_trials.TrialsElasticsearchQuery.generate_query_embedding", "modulename": "debeir.datasets.clinical_trials", "qualname": "TrialsElasticsearchQuery.generate_query_embedding", "kind": "function", "doc": "Computes the NIR score for a given topic
\n\nScore = log(BM25)/log(norm_weight) + embedding_score
\n\nParameters \n\n\ntopic_num : \nencoder : \nquery_field_usage : \nembed_field_usage : \ncosine_weights : \nquery_weight : \nnorm_weight : \nablations : \nautomatic_scores : \nkwargs : \n \n\nReturns \n", "signature": "(self , * args , ** kwargs ): ", "funcdef": "def"}, {"fullname": "debeir.datasets.clinical_trials.TrialsElasticsearchQuery.get_query_type", "modulename": "debeir.datasets.clinical_trials", "qualname": "TrialsElasticsearchQuery.get_query_type", "kind": "function", "doc": "
\n", "signature": "(self , * args , ** kwargs ): ", "funcdef": "def"}, {"fullname": "debeir.datasets.clinical_trials.TrialsElasticsearchQuery.get_id_mapping", "modulename": "debeir.datasets.clinical_trials", "qualname": "TrialsElasticsearchQuery.get_id_mapping", "kind": "function", "doc": "Get the document ID
\n\nParameters \n\n\nhit : The raw document result \n \n\nReturns \n\n\nThe document's ID\n
\n \n", "signature": "(self , hit ): ", "funcdef": "def"}, {"fullname": "debeir.datasets.clinical_trials.ClinicalTrialsElasticsearchExecutor", "modulename": "debeir.datasets.clinical_trials", "qualname": "ClinicalTrialsElasticsearchExecutor", "kind": "class", "doc": "Executes queries given a query object.
\n", "bases": "debeir.core.executor.GenericElasticsearchExecutor"}, {"fullname": "debeir.datasets.clinical_trials.ClinicalTrialsElasticsearchExecutor.__init__", "modulename": "debeir.datasets.clinical_trials", "qualname": "ClinicalTrialsElasticsearchExecutor.__init__", "kind": "function", "doc": "
\n", "signature": "(\ttopics : Dict [ Union [ str , int ], Dict [ str , str ]] , \tclient : elasticsearch . AsyncElasticsearch , \tindex_name : str , \toutput_file : str , \tquery : debeir . datasets . clinical_trials . TrialsElasticsearchQuery , \tencoder : Optional [ debeir . rankers . transformer_sent_encoder . Encoder ] = None , \tconfig = None , \t* args , \t** kwargs ) "}, {"fullname": "debeir.datasets.clinical_trials.ClinicalTrialParser", "modulename": "debeir.datasets.clinical_trials", "qualname": "ClinicalTrialParser", "kind": "class", "doc": "Parser for Clinical Trials topics
\n", "bases": "debeir.core.parser.Parser"}, {"fullname": "debeir.datasets.clinical_trials.ClinicalTrialParser.get_topics", "modulename": "debeir.datasets.clinical_trials", "qualname": "ClinicalTrialParser.get_topics", "kind": "function", "doc": "Instance method for getting topics, forwards instance self parameters to the _get_topics class method.
\n", "signature": "(cls , csvfile ) -> Dict [ int , Dict [ str , str ]] : ", "funcdef": "def"}, {"fullname": "debeir.datasets.factory", "modulename": "debeir.datasets.factory", "kind": "module", "doc": "
\n"}, {"fullname": "debeir.datasets.factory.get_index_name", "modulename": "debeir.datasets.factory", "qualname": "get_index_name", "kind": "function", "doc": "Get the index name from the config without parsing as a TOML
\n\nParameters \n\n\n\nReturns \n", "signature": "(config_fp ): ", "funcdef": "def"}, {"fullname": "debeir.datasets.factory.factory_fn", "modulename": "debeir.datasets.factory", "qualname": "factory_fn", "kind": "function", "doc": "Factory method for creating the parsed topics, config object, query object and query executor object
\n\nParameters \n\n\nconfig_fp : Config file path \nindex : Index to search \n \n\nReturns \n\n\nQuery, Config, Parser, Executor, Evaluator\n
\n \n", "signature": "(\tconfig_fp , \tindex = None ) -> (<class 'debeir.core.query.Query'>, <class 'debeir.core.config.GenericConfig'>, <class 'debeir.core.parser.Parser'>, <class 'debeir.core.executor.GenericElasticsearchExecutor'>, <class 'debeir.evaluation.evaluator.Evaluator'>): ", "funcdef": "def"}, {"fullname": "debeir.datasets.factory.config_factory", "modulename": "debeir.datasets.factory", "qualname": "config_factory", "kind": "function", "doc": "Factory method for creating configs
\n\nParameters \n\n\npath : Config path \nconfig_cls : Config class to instantiate \nargs_dict : Arguments to consider \n \n\nReturns \n\n\nA config object\n
\n \n", "signature": "(\tpath : Union [ str , pathlib . Path ] = None , \tconfig_cls : Type [ debeir . core . config . Config ] = None , \targs_dict : Dict = None ): ", "funcdef": "def"}, {"fullname": "debeir.datasets.factory.get_nir_config", "modulename": "debeir.datasets.factory", "qualname": "get_nir_config", "kind": "function", "doc": "
\n", "signature": "(nir_config , * args , ignore_errors = False , ** kwargs ): ", "funcdef": "def"}, {"fullname": "debeir.datasets.factory.apply_nir_config", "modulename": "debeir.datasets.factory", "qualname": "apply_nir_config", "kind": "function", "doc": "Decorator that applies the NIR config settings to the current function\nReplaces arguments and keywords arguments with those found in the config
\n\nParameters \n\n\n\nReturns \n", "signature": "(func ): ", "funcdef": "def"}, {"fullname": "debeir.datasets.marco", "modulename": "debeir.datasets.marco", "kind": "module", "doc": "
\n"}, {"fullname": "debeir.datasets.marco.MarcoElasticsearchExecutor", "modulename": "debeir.datasets.marco", "qualname": "MarcoElasticsearchExecutor", "kind": "class", "doc": "Generic Executor class for Elasticsearch
\n", "bases": "debeir.core.executor.GenericElasticsearchExecutor"}, {"fullname": "debeir.datasets.marco.MarcoElasticsearchExecutor.__init__", "modulename": "debeir.datasets.marco", "qualname": "MarcoElasticsearchExecutor.__init__", "kind": "function", "doc": "
\n", "signature": "(\ttopics : Dict [ Union [ str , int ], Dict [ str , str ]] , \tclient : elasticsearch . AsyncElasticsearch , \tindex_name : str , \toutput_file : str , \tquery : debeir . core . query . GenericElasticsearchQuery , \tencoder : Optional [ debeir . rankers . transformer_sent_encoder . Encoder ] = None , \tconfig = None , \t* args , \t** kwargs ) "}, {"fullname": "debeir.datasets.marco.MarcoElasticsearchExecutor.generate_query", "modulename": "debeir.datasets.marco", "qualname": "MarcoElasticsearchExecutor.generate_query", "kind": "function", "doc": "Generates a standard BM25 query given the topic number
\n\nParameters \n\n\ntopic_num : Query topic number to generate \nbest_fields : Whether to use a curated list of fields \nkwargs : \n \n\nReturns \n", "signature": "(self , topic_num , best_fields = True , ** kwargs ): ", "funcdef": "def"}, {"fullname": "debeir.datasets.marco.MarcoElasticsearchExecutor.generate_embedding_query", "modulename": "debeir.datasets.marco", "qualname": "MarcoElasticsearchExecutor.generate_embedding_query", "kind": "function", "doc": "Executes an NIR-style query with combined scoring.
\n\nParameters \n\n\ntopic_num : \ncosine_weights : \nquery_weights : \nnorm_weight : \nautomatic_scores : \nkwargs : \n \n\nReturns \n", "signature": "(\tself , \ttopic_num , \tcosine_weights = None , \tquery_weights = None , \tnorm_weight = 2.15 , \tautomatic_scores = None , \t** kwargs ): ", "funcdef": "def"}, {"fullname": "debeir.datasets.marco.MarcoElasticsearchExecutor.execute_query", "modulename": "debeir.datasets.marco", "qualname": "MarcoElasticsearchExecutor.execute_query", "kind": "function", "doc": "Execute a query given parameters
\n\nParameters \n\n\n", "signature": "(\tself , \tquery = None , \ttopic_num = None , \tablation = False , \tquery_type = 'query' , \t** kwargs ): ", "funcdef": "async def"}, {"fullname": "debeir.datasets.marco.MarcoQueryConfig", "modulename": "debeir.datasets.marco", "qualname": "MarcoQueryConfig", "kind": "class", "doc": "
\n", "bases": "debeir.core.config.GenericConfig"}, {"fullname": "debeir.datasets.marco.MarcoQueryConfig.__init__", "modulename": "debeir.datasets.marco", "qualname": "MarcoQueryConfig.__init__", "kind": "function", "doc": "
\n", "signature": "(\tquery_type : str , \tindex : str = None , \tencoder_normalize : bool = True , \tablations : bool = False , \tnorm_weight : float = None , \tautomatic : bool = None , \tencoder : object = None , \tencoder_fp : str = None , \tquery_weights : List [ float ] = None , \tcosine_weights : List [ float ] = None , \tevaluate : bool = False , \tqrels : str = None , \tconfig_fn : str = None , \tquery_fn : str = None , \tparser_fn : str = None , \texecutor_fn : str = None , \tcosine_ceiling : float = None , \ttopics_path : str = None , \treturn_id_only : bool = False , \toverwrite_output_if_exists : bool = False , \toutput_file : str = None , \trun_name : str = None ) "}, {"fullname": "debeir.datasets.marco.MarcoQueryConfig.validate", "modulename": "debeir.datasets.marco", "qualname": "MarcoQueryConfig.validate", "kind": "function", "doc": "Validates if the config is correct.\nMust be implemented by inherited classes.
\n", "signature": "(self ): ", "funcdef": "def"}, {"fullname": "debeir.datasets.marco.MarcoQueryConfig.from_toml", "modulename": "debeir.datasets.marco", "qualname": "MarcoQueryConfig.from_toml", "kind": "function", "doc": "Instantiates a Config object from a toml file
\n\nParameters \n\n\nfp : File path of the Config TOML file \nfield_class : Class of the Config object to be instantiated \nargs : Arguments to be passed to Config \nkwargs : Keyword arguments to be passed \n \n\nReturns \n\n\nA instantiated and validated Config object.\n
\n \n", "signature": "(cls , fp : str , * args , ** kwargs ) -> debeir . datasets . marco . MarcoQueryConfig : ", "funcdef": "def"}, {"fullname": "debeir.datasets.marco.MarcoQueryConfig.from_dict", "modulename": "debeir.datasets.marco", "qualname": "MarcoQueryConfig.from_dict", "kind": "function", "doc": "Instantiates a Config object from a dictionary
\n\nParameters \n\n\ndata_class : \nkwargs : \n \n\nReturns \n", "signature": "(cls , ** kwargs ) -> debeir . datasets . marco . MarcoQueryConfig : ", "funcdef": "def"}, {"fullname": "debeir.datasets.trec_clinical_trials", "modulename": "debeir.datasets.trec_clinical_trials", "kind": "module", "doc": "
\n"}, {"fullname": "debeir.datasets.trec_clinical_trials.TREClinicalTrialDocumentParser", "modulename": "debeir.datasets.trec_clinical_trials", "qualname": "TREClinicalTrialDocumentParser", "kind": "class", "doc": "Parser for Clinical Trials topics
\n", "bases": "debeir.core.parser.XMLParser"}, {"fullname": "debeir.datasets.trec_clinical_trials.TREClinicalTrialDocumentParser.extract", "modulename": "debeir.datasets.trec_clinical_trials", "qualname": "TREClinicalTrialDocumentParser.extract", "kind": "function", "doc": "
\n", "signature": "(cls , path ) -> Dict : ", "funcdef": "def"}, {"fullname": "debeir.datasets.trec_clinical_trials.TrecClincialElasticsearchQuery", "modulename": "debeir.datasets.trec_clinical_trials", "qualname": "TrecClincialElasticsearchQuery", "kind": "class", "doc": "A generic elasticsearch query. Contains methods for NIR-style (embedding) queries and normal BM25 queries.\nRequires topics, configs to be included
\n", "bases": "debeir.core.query.GenericElasticsearchQuery"}, {"fullname": "debeir.datasets.trec_clinical_trials.TrecClincialElasticsearchQuery.__init__", "modulename": "debeir.datasets.trec_clinical_trials", "qualname": "TrecClincialElasticsearchQuery.__init__", "kind": "function", "doc": "
\n", "signature": "(topics , config , * args , ** kwargs ) "}, {"fullname": "debeir.datasets.trec_covid", "modulename": "debeir.datasets.trec_covid", "kind": "module", "doc": "
\n"}, {"fullname": "debeir.datasets.trec_covid.TrecCovidParser", "modulename": "debeir.datasets.trec_covid", "qualname": "TrecCovidParser", "kind": "class", "doc": "Load topics from an XML file
\n", "bases": "debeir.core.parser.XMLParser"}, {"fullname": "debeir.datasets.trec_covid.TrecCovidParser.get_topics", "modulename": "debeir.datasets.trec_covid", "qualname": "TrecCovidParser.get_topics", "kind": "function", "doc": "Instance method for getting topics, forwards instance self parameters to the _get_topics class method.
\n", "signature": "(cls , xmlfile ) -> Dict [ int , Dict [ str , str ]] : ", "funcdef": "def"}, {"fullname": "debeir.datasets.trec_covid.TrecElasticsearchQuery", "modulename": "debeir.datasets.trec_covid", "qualname": "TrecElasticsearchQuery", "kind": "class", "doc": "A generic elasticsearch query. Contains methods for NIR-style (embedding) queries and normal BM25 queries.\nRequires topics, configs to be included
\n", "bases": "debeir.core.query.GenericElasticsearchQuery"}, {"fullname": "debeir.datasets.trec_covid.TrecElasticsearchQuery.__init__", "modulename": "debeir.datasets.trec_covid", "qualname": "TrecElasticsearchQuery.__init__", "kind": "function", "doc": "
\n", "signature": "(topics , config , * args , ** kwargs ) "}, {"fullname": "debeir.datasets.types", "modulename": "debeir.datasets.types", "kind": "module", "doc": "
\n"}, {"fullname": "debeir.datasets.types.InputExample", "modulename": "debeir.datasets.types", "qualname": "InputExample", "kind": "class", "doc": "Copied from Sentence Transformer Library\nStructure for one input example with texts, the label and a unique id
\n"}, {"fullname": "debeir.datasets.types.InputExample.__init__", "modulename": "debeir.datasets.types", "qualname": "InputExample.__init__", "kind": "function", "doc": "Creates one InputExample with the given texts, guid and label
\n\n:param guid\n id for the example\n:param texts\n the texts for the example. Note, str.strip() is called on the texts\n:param label\n the label for the example
\n", "signature": "(\tguid : str = '' , \ttexts : List [ str ] = None , \tlabel : Union [ int , float ] = 0 ) "}, {"fullname": "debeir.datasets.types.InputExample.get_label", "modulename": "debeir.datasets.types", "qualname": "InputExample.get_label", "kind": "function", "doc": "
\n", "signature": "(self ): ", "funcdef": "def"}, {"fullname": "debeir.datasets.types.InputExample.to_dict", "modulename": "debeir.datasets.types", "qualname": "InputExample.to_dict", "kind": "function", "doc": "
\n", "signature": "(cls , data : List [ debeir . datasets . types . InputExample ] ): ", "funcdef": "def"}, {"fullname": "debeir.datasets.types.InputExample.from_parser_output", "modulename": "debeir.datasets.types", "qualname": "InputExample.from_parser_output", "kind": "function", "doc": "
\n", "signature": "(cls , data ): ", "funcdef": "def"}, {"fullname": "debeir.datasets.types.RelevanceExample", "modulename": "debeir.datasets.types", "qualname": "RelevanceExample", "kind": "class", "doc": "Converts Relevance Labels to 0 - 1
\n", "bases": "InputExample"}, {"fullname": "debeir.datasets.types.RelevanceExample.__init__", "modulename": "debeir.datasets.types", "qualname": "RelevanceExample.__init__", "kind": "function", "doc": "Creates one InputExample with the given texts, guid and label
\n\n:param guid\n id for the example\n:param texts\n the texts for the example. Note, str.strip() is called on the texts\n:param label\n the label for the example
\n", "signature": "(max_score = 2 , * args , ** kwargs ) "}, {"fullname": "debeir.datasets.types.RelevanceExample.get_label", "modulename": "debeir.datasets.types", "qualname": "RelevanceExample.get_label", "kind": "function", "doc": "
\n", "signature": "(self ): ", "funcdef": "def"}, {"fullname": "debeir.datasets.types.RelevanceExample.relevance", "modulename": "debeir.datasets.types", "qualname": "RelevanceExample.relevance", "kind": "function", "doc": "Returns \n\n\nReturns a normalised score for relevance between 0 - 1\n
\n \n", "signature": "(self ): ", "funcdef": "def"}, {"fullname": "debeir.datasets.types.DatasetTypes", "modulename": "debeir.datasets.types", "qualname": "DatasetTypes", "kind": "class", "doc": "A collection of common dataset types that is usable in the library.
\n", "bases": "enum.Enum"}, {"fullname": "debeir.datasets.utils", "modulename": "debeir.datasets.utils", "kind": "module", "doc": "
\n"}, {"fullname": "debeir.datasets.utils.CrossValidatorDataset", "modulename": "debeir.datasets.utils", "qualname": "CrossValidatorDataset", "kind": "class", "doc": "Cross Validator Dataset
\n"}, {"fullname": "debeir.datasets.utils.CrossValidatorDataset.__init__", "modulename": "debeir.datasets.utils", "qualname": "CrossValidatorDataset.__init__", "kind": "function", "doc": "
\n", "signature": "(dataset , cross_validator , n_folds , x_attr = 'text' , y_attr = 'label' ) "}, {"fullname": "debeir.datasets.utils.CrossValidatorDataset.prepare_cross_validator", "modulename": "debeir.datasets.utils", "qualname": "CrossValidatorDataset.prepare_cross_validator", "kind": "function", "doc": "Prepare the cross validator dataset object that will internally produce the folds.
\n\nParameters \n\n\ndata : Dataset to be used. Should be a list of dicts, or list of [x,y] or a Dataset object from datasets \nevaluator : Evaluator to use for checking results \nn_splits : Number of cross validation splits, k-fold (stratified) \nseed : Seed to use (default 42) \ny_attr : Label, or idx of the y label \nx_attr : Label or idx of the x label (not directly used) \n \n", "signature": "(\tcls , \tdata , \tevaluator : debeir . evaluation . evaluator . Evaluator , \tn_splits : int , \tx_attr , \ty_attr , \tseed = 42 ) -> debeir . datasets . utils . CrossValidatorDataset : ", "funcdef": "def"}, {"fullname": "debeir.datasets.utils.CrossValidatorDataset.get_fold", "modulename": "debeir.datasets.utils", "qualname": "CrossValidatorDataset.get_fold", "kind": "function", "doc": "Get the fold and returns a dataset.DataDict object with\nDataDict{'train': ..., 'val': ...}
\n\nParameters \n\n\n", "signature": "(self , idx ) -> datasets . dataset_dict . DatasetDict : ", "funcdef": "def"}, {"fullname": "debeir.engines", "modulename": "debeir.engines", "kind": "module", "doc": "WIP
\n\nImplemented Search Engines to run queries against.
\n"}, {"fullname": "debeir.engines.client", "modulename": "debeir.engines.client", "kind": "module", "doc": "
\n"}, {"fullname": "debeir.engines.client.Client", "modulename": "debeir.engines.client", "qualname": "Client", "kind": "class", "doc": "Overarching client interface object that contains references to different clients for search\nAllows sharing between function calls
\n"}, {"fullname": "debeir.engines.client.Client.__init__", "modulename": "debeir.engines.client", "qualname": "Client.__init__", "kind": "function", "doc": "
\n", "signature": "(\tes_client : elasticsearch . AsyncElasticsearch = None , \tsolr_client : object = None , \tgeneric_client : object = None ) "}, {"fullname": "debeir.engines.client.Client.build_from_config", "modulename": "debeir.engines.client", "qualname": "Client.build_from_config", "kind": "function", "doc": "Build client from engine config
\n\nParameters \n\n\nengine_type : \nengine_config : \n \n\nReturns \n", "signature": "(cls , engine_type , engine_config ) -> debeir . engines . client . Client : ", "funcdef": "def"}, {"fullname": "debeir.engines.client.Client.get_client", "modulename": "debeir.engines.client", "qualname": "Client.get_client", "kind": "function", "doc": "
\n", "signature": "(self , engine ): ", "funcdef": "def"}, {"fullname": "debeir.engines.client.Client.close", "modulename": "debeir.engines.client", "qualname": "Client.close", "kind": "function", "doc": "Generically close all contained client objects
\n", "signature": "(self ): ", "funcdef": "async def"}, {"fullname": "debeir.engines.dummyindex", "modulename": "debeir.engines.dummyindex", "kind": "module", "doc": "
\n"}, {"fullname": "debeir.engines.dummyindex.index", "modulename": "debeir.engines.dummyindex.index", "kind": "module", "doc": "
\n"}, {"fullname": "debeir.engines.dummyindex.index.DummyIndex", "modulename": "debeir.engines.dummyindex.index", "qualname": "DummyIndex", "kind": "class", "doc": "
\n"}, {"fullname": "debeir.engines.dummyindex.index.DummyIndex.__init__", "modulename": "debeir.engines.dummyindex.index", "qualname": "DummyIndex.__init__", "kind": "function", "doc": "
\n", "signature": "() "}, {"fullname": "debeir.engines.dummyindex.index.DummyIndex.get_documents", "modulename": "debeir.engines.dummyindex.index", "qualname": "DummyIndex.get_documents", "kind": "function", "doc": "
\n", "signature": "(self ): ", "funcdef": "def"}, {"fullname": "debeir.engines.dummyindex.index.DummyIndex.query", "modulename": "debeir.engines.dummyindex.index", "qualname": "DummyIndex.query", "kind": "function", "doc": "
\n", "signature": "(self ): ", "funcdef": "def"}, {"fullname": "debeir.engines.dummyindex.index.DummyIndex.scorer", "modulename": "debeir.engines.dummyindex.index", "qualname": "DummyIndex.scorer", "kind": "function", "doc": "
\n", "signature": "(self ): ", "funcdef": "def"}, {"fullname": "debeir.engines.dummyindex.index.es_isup", "modulename": "debeir.engines.dummyindex.index", "qualname": "es_isup", "kind": "function", "doc": "
\n", "signature": "(es_client : elasticsearch . AsyncElasticsearch ): ", "funcdef": "async def"}, {"fullname": "debeir.engines.elasticsearch", "modulename": "debeir.engines.elasticsearch", "kind": "module", "doc": "Library code for interacting with the elasticsearch engine
\n\nContains many helper functions for asynchronous and fast querying, with optional caching available
\n"}, {"fullname": "debeir.engines.elasticsearch.change_bm25", "modulename": "debeir.engines.elasticsearch.change_bm25", "kind": "module", "doc": "
\n"}, {"fullname": "debeir.engines.elasticsearch.change_bm25.change_bm25_params", "modulename": "debeir.engines.elasticsearch.change_bm25", "qualname": "change_bm25_params", "kind": "function", "doc": "Change the BM25 parameters of the elasticsearch BM25 ranker.
\n\nParameters \n\n\nindex : The elasticsearch index name \nk1 : The k parameter for BM25 (default 1.2) [Usually 0-3] [Term saturation constant] ->\nThe higher the k value, the more weight given to document that repeat terms. \nb : The b parameter for BM25 (default 0.75) [Usually 0-1] [Document length constant] ->\nThe higher the b value, the higher it penalises longer documents. \nbase_url : The elasticsearch base URL for API requests (without index suffix) \n \n", "signature": "(index , k1 : float , b : float , base_url : str = 'http://localhost:9200' ): ", "funcdef": "def"}, {"fullname": "debeir.engines.elasticsearch.executor", "modulename": "debeir.engines.elasticsearch.executor", "kind": "module", "doc": "
\n"}, {"fullname": "debeir.engines.elasticsearch.executor.ElasticsearchExecutor", "modulename": "debeir.engines.elasticsearch.executor", "qualname": "ElasticsearchExecutor", "kind": "class", "doc": "Executes an elasticsearch query given the query generated from the config, topics and query class object.
\n\nComputes regular patterns of queries expected from general IR topics and indexes.\nIncludes:\n 1. Reranking\n 2. End-to-End Neural IR\n 3. Statistical keyword matching
\n"}, {"fullname": "debeir.engines.elasticsearch.executor.ElasticsearchExecutor.__init__", "modulename": "debeir.engines.elasticsearch.executor", "qualname": "ElasticsearchExecutor.__init__", "kind": "function", "doc": "
\n", "signature": "(\ttopics : Dict [ Union [ str , int ], Dict [ str , str ]] , \tclient : elasticsearch . AsyncElasticsearch , \tindex_name : str , \toutput_file : str , \tquery : debeir . core . query . GenericElasticsearchQuery , \tencoder : Optional [ debeir . rankers . transformer_sent_encoder . Encoder ] , \treturn_size : int = 1000 , \ttest = False , \treturn_id_only = True , \tconfig = None ) "}, {"fullname": "debeir.engines.elasticsearch.executor.ElasticsearchExecutor.generate_query", "modulename": "debeir.engines.elasticsearch.executor", "qualname": "ElasticsearchExecutor.generate_query", "kind": "function", "doc": "Generates a query given a topic number from the list of topics
\n\nParameters \n\n\n", "signature": "(self , topic_num ): ", "funcdef": "def"}, {"fullname": "debeir.engines.elasticsearch.executor.ElasticsearchExecutor.execute_query", "modulename": "debeir.engines.elasticsearch.executor", "qualname": "ElasticsearchExecutor.execute_query", "kind": "function", "doc": "Execute a query given parameters
\n\nParameters \n\n\n", "signature": "(self , * args , ** kwargs ): ", "funcdef": "def"}, {"fullname": "debeir.engines.elasticsearch.executor.ElasticsearchExecutor.run_all_queries", "modulename": "debeir.engines.elasticsearch.executor", "qualname": "ElasticsearchExecutor.run_all_queries", "kind": "function", "doc": "A generic function that will asynchronously run all topics using the execute_query() method
\n\nParameters \n\n\nquery_type : Which query to execute. Query_type determines which method is used to generate the queries\nfrom self.query.query_funcs: Dict[str, func] \nreturn_results : Whether to return raw results from the client. Useful for analysing results directly or\nfor computing the BM25 scores for log normalization in NIR-style scoring \nreturn_size : Number of documents to return. Overrides the config value if exists. \nreturn_id_only : Return the ID of the document only, rather than the full source document. \nargs : Arguments to pass to the execute_query method \nkwargs : Keyword arguments to pass to the execute_query method \n \n\nReturns \n\n\nA list of results if return_results = True else an empty list is returned.\n
\n \n", "signature": "(\tself , \tquery_type = None , \treturn_results = False , \treturn_size : int = None , \treturn_id_only : bool = False , \t** kwargs ) -> List : ", "funcdef": "async def"}, {"fullname": "debeir.engines.elasticsearch.generate_script_score", "modulename": "debeir.engines.elasticsearch.generate_script_score", "kind": "module", "doc": "
\n"}, {"fullname": "debeir.engines.elasticsearch.generate_script_score.SourceBuilder", "modulename": "debeir.engines.elasticsearch.generate_script_score", "qualname": "SourceBuilder", "kind": "class", "doc": "Builds Script Score source for NIR-style queries in elasticsearch\nUses the painless language
\n\nThis is a string builder class
\n"}, {"fullname": "debeir.engines.elasticsearch.generate_script_score.SourceBuilder.__init__", "modulename": "debeir.engines.elasticsearch.generate_script_score", "qualname": "SourceBuilder.__init__", "kind": "function", "doc": "
\n", "signature": "() "}, {"fullname": "debeir.engines.elasticsearch.generate_script_score.SourceBuilder.add_preamble", "modulename": "debeir.engines.elasticsearch.generate_script_score", "qualname": "SourceBuilder.add_preamble", "kind": "function", "doc": "Adds preamble to the internal string\nThis will return the bm25 score if the normalization constant is below 0
\n", "signature": "(self ): ", "funcdef": "def"}, {"fullname": "debeir.engines.elasticsearch.generate_script_score.SourceBuilder.add_log_score", "modulename": "debeir.engines.elasticsearch.generate_script_score", "qualname": "SourceBuilder.add_log_score", "kind": "function", "doc": "Adds the BM25 log score line
\n\nParameters \n\n\nignore_below_one : Ignore all scores below 1.0 as Log(1) = 0. Otherwise, just ignore Log(0 and under). \n \n\nReturns \n\n\nSourceBuilder\n
\n \n", "signature": "(\tself , \tignore_below_one = False ) -> debeir . engines . elasticsearch . generate_script_score . SourceBuilder : ", "funcdef": "def"}, {"fullname": "debeir.engines.elasticsearch.generate_script_score.SourceBuilder.add_embed_field", "modulename": "debeir.engines.elasticsearch.generate_script_score", "qualname": "SourceBuilder.add_embed_field", "kind": "function", "doc": "Adds a cosine score line.
\n\nParameters \n\n\nqfield : Query field \nfield : Document facet field \n \n\nReturns \n", "signature": "(\tself , \tqfield , \tfield ) -> debeir . engines . elasticsearch . generate_script_score . SourceBuilder : ", "funcdef": "def"}, {"fullname": "debeir.engines.elasticsearch.generate_script_score.SourceBuilder.finish", "modulename": "debeir.engines.elasticsearch.generate_script_score", "qualname": "SourceBuilder.finish", "kind": "function", "doc": "Finalises the script score and returns the internal string
\n\nReturns \n\n\nA string containing the script score query\n
\n \n", "signature": "(self ): ", "funcdef": "def"}, {"fullname": "debeir.engines.elasticsearch.generate_script_score.generate_source", "modulename": "debeir.engines.elasticsearch.generate_script_score", "qualname": "generate_source", "kind": "function", "doc": "Generates the script source based off a set of input fields and facets
\n\nParameters \n\n\nqfields : Query fields (or topic fields) \nfields : Document facets to compute cosine similarity on \n \n\nReturns \n", "signature": "(qfields : Union [ list , str ] , fields ) -> str : ", "funcdef": "def"}, {"fullname": "debeir.engines.elasticsearch.generate_script_score.check_params_is_valid", "modulename": "debeir.engines.elasticsearch.generate_script_score", "qualname": "check_params_is_valid", "kind": "function", "doc": "Validate if the parameters for the script score passes a simple sanity check.
\n\nParameters \n\n\n", "signature": "(params , qfields ): ", "funcdef": "def"}, {"fullname": "debeir.engines.elasticsearch.generate_script_score.generate_script", "modulename": "debeir.engines.elasticsearch.generate_script_score", "qualname": "generate_script", "kind": "function", "doc": "Parameters for creating the script
\n\nParameters \n\n\nfields : Document fields to search \nparams : Parameters for the script \nsource_generator : Function that will generate the script \nqfields : Query fields to search from (topic facets) \n \n\nReturns \n", "signature": "(\tfields , \tparams , \tsource_generator =< function generate_source > , \tqfields = 'q_eb' ) -> Dict : ", "funcdef": "def"}, {"fullname": "debeir.engines.solr", "modulename": "debeir.engines.solr", "kind": "module", "doc": "
\n"}, {"fullname": "debeir.evaluation", "modulename": "debeir.evaluation", "kind": "module", "doc": "Evaluation for retrieved results.
\n\nWorks for TREC-style queries or for out-the-box returned results from the implemented search engines.
\n"}, {"fullname": "debeir.evaluation.cross_validation", "modulename": "debeir.evaluation.cross_validation", "kind": "module", "doc": "
\n"}, {"fullname": "debeir.evaluation.cross_validation.split_k_fold", "modulename": "debeir.evaluation.cross_validation", "qualname": "split_k_fold", "kind": "function", "doc": "
\n", "signature": "(n_fold , data_files ): ", "funcdef": "def"}, {"fullname": "debeir.evaluation.cross_validation.CrossValidatorTypes", "modulename": "debeir.evaluation.cross_validation", "qualname": "CrossValidatorTypes", "kind": "class", "doc": "Cross Validator Strategies for separating the dataset
\n", "bases": "enum.Enum"}, {"fullname": "debeir.evaluation.cross_validation.CrossValidatorTypes.Stratified", "modulename": "debeir.evaluation.cross_validation", "qualname": "CrossValidatorTypes.Stratified", "kind": "variable", "doc": "
\n", "default_value": " = <CrossValidatorTypes.Stratified: 'StratifiedKFold'>"}, {"fullname": "debeir.evaluation.cross_validation.CrossValidatorTypes.KFold", "modulename": "debeir.evaluation.cross_validation", "qualname": "CrossValidatorTypes.KFold", "kind": "variable", "doc": "
\n", "default_value": " = <CrossValidatorTypes.KFold: 'KFold'>"}, {"fullname": "debeir.evaluation.cross_validation.CrossValidator", "modulename": "debeir.evaluation.cross_validation", "qualname": "CrossValidator", "kind": "class", "doc": "Cross Validator Class for different types of datasets
\n\nE.g. List -> [[Data], label]\n List[Dict] -> {\"data\": Data, \"label\": label}\n Huggingface Dataset Object -> Data(set=\"train\", label = \"label\").select(idx)
\n"}, {"fullname": "debeir.evaluation.cross_validation.CrossValidator.__init__", "modulename": "debeir.evaluation.cross_validation", "qualname": "CrossValidator.__init__", "kind": "function", "doc": "
\n", "signature": "(\tdataset : Union [ List , List [ Dict ], datasets . arrow_dataset . Dataset ] , \tx_idx_label_or_attr : Union [ str , int ] , \ty_idx_label_or_attr : Union [ str , int ] , \tcross_validator_type: [<class 'str'>, <enum 'CrossValidatorTypes'>] = <CrossValidatorTypes.Stratified: 'StratifiedKFold'>, \tseed = 42 , \tn_splits = 5 ) "}, {"fullname": "debeir.evaluation.cross_validation.CrossValidator.get_fold", "modulename": "debeir.evaluation.cross_validation", "qualname": "CrossValidator.get_fold", "kind": "function", "doc": "Parameters \n\n\nfold_num : Which fold to pick \n \n\nReturns \n", "signature": "(self , fold_num : int ): ", "funcdef": "def"}, {"fullname": "debeir.evaluation.evaluator", "modulename": "debeir.evaluation.evaluator", "kind": "module", "doc": "
\n"}, {"fullname": "debeir.evaluation.evaluator.Evaluator", "modulename": "debeir.evaluation.evaluator", "qualname": "Evaluator", "kind": "class", "doc": "Evaluation class for computing metrics from TREC-style files
\n"}, {"fullname": "debeir.evaluation.evaluator.Evaluator.__init__", "modulename": "debeir.evaluation.evaluator", "qualname": "Evaluator.__init__", "kind": "function", "doc": "
\n", "signature": "(qrels : str , metrics : List [ str ] ) "}, {"fullname": "debeir.evaluation.evaluator.Evaluator.evaluate_runs", "modulename": "debeir.evaluation.evaluator", "qualname": "Evaluator.evaluate_runs", "kind": "function", "doc": "Evaluates the TREC-style results from an input result list or file
\n\nParameters \n\n\nres : Results file path or raw results list \nkwargs : Keyword arguments to pass to the underlying analysis_tools_ir.parse_run library \n \n\nReturns \n", "signature": "(self , res : Union [ str , List [ str ]] , ** kwargs ): ", "funcdef": "def"}, {"fullname": "debeir.evaluation.evaluator.Evaluator.average_all_metrics", "modulename": "debeir.evaluation.evaluator", "qualname": "Evaluator.average_all_metrics", "kind": "function", "doc": "Averages the metric per topic scores into a single averaged score.
\n\nParameters \n\n\nruns: Parsed run dictionary: {metric_name@depth : Run object} \nlogger : Logger to print metrics \n \n", "signature": "(\tself , \truns : Dict , \tlogger : < loguru . logger handlers = [( id = 0 , level = 10 , sink =< _io . StringIO object at 0x105cfa710 > )] > ): ", "funcdef": "def"}, {"fullname": "debeir.evaluation.evaluator.Evaluator.sigtests", "modulename": "debeir.evaluation.evaluator", "qualname": "Evaluator.sigtests", "kind": "function", "doc": "Run a paired significance test on two result files
\n\nParameters \n\n\nresults_a : \nresults_b : \n \n\nReturns \n", "signature": "(self , results_a , results_b ): ", "funcdef": "def"}, {"fullname": "debeir.evaluation.evaluator.Evaluator.build_from_config", "modulename": "debeir.evaluation.evaluator", "qualname": "Evaluator.build_from_config", "kind": "function", "doc": "
\n", "signature": "(\tcls , \tconfig : debeir . core . config . GenericConfig , \tmetrics_config : debeir . core . config . MetricsConfig ): ", "funcdef": "def"}, {"fullname": "debeir.evaluation.residual_scoring", "modulename": "debeir.evaluation.residual_scoring", "kind": "module", "doc": "
\n"}, {"fullname": "debeir.evaluation.residual_scoring.ResidualEvaluator", "modulename": "debeir.evaluation.residual_scoring", "qualname": "ResidualEvaluator", "kind": "class", "doc": "Residual Scoring is the scoring of a subset of documents or the residiaul. The residual is created by removing documents from the collection and qrels.
\n", "bases": "debeir.evaluation.evaluator.Evaluator"}, {"fullname": "debeir.evaluation.residual_scoring.ResidualEvaluator.__init__", "modulename": "debeir.evaluation.residual_scoring", "qualname": "ResidualEvaluator.__init__", "kind": "function", "doc": "Args:\n qrels (str): Path to qrels \n metrics (List[str]): A list of metrics with depth e.g. NDCG@1000\n filter_ids (Dict[str, List[str]]): A list of IDs to remove from the collection given by Dict[Topic_num, [Docids]]
\n", "signature": "(qrels : str , metrics : List [ str ] , filter_ids : Dict [ str , List [ str ]] ) "}, {"fullname": "debeir.evaluation.residual_scoring.ResidualEvaluator.evaluate_runs", "modulename": "debeir.evaluation.residual_scoring", "qualname": "ResidualEvaluator.evaluate_runs", "kind": "function", "doc": "Run the residual evaluation for the runs
\n\nParameters \n\n\nres : The results to run the evaluator against \nwith_trec_binary : Use the TREC C binary instead of the default Python library, defaults to False \n \n\nReturns \n\n\n A dictionary of supplied metrics of the results against the qrels
\n \n", "signature": "(self , res : Union [ str , List [ str ]] , with_trec_binary = False , ** kwargs ): ", "funcdef": "def"}, {"fullname": "debeir.models", "modulename": "debeir.models", "kind": "module", "doc": "
\n"}, {"fullname": "debeir.models.colbert", "modulename": "debeir.models.colbert", "kind": "module", "doc": "
\n"}, {"fullname": "debeir.models.colbert.CoLBERTConfig", "modulename": "debeir.models.colbert", "qualname": "CoLBERTConfig", "kind": "class", "doc": "
\n"}, {"fullname": "debeir.models.colbert.CoLBERTConfig.__init__", "modulename": "debeir.models.colbert", "qualname": "CoLBERTConfig.__init__", "kind": "function", "doc": "
\n", "signature": "(** kwargs ) "}, {"fullname": "debeir.models.colbert.CoLBERTConfig.save", "modulename": "debeir.models.colbert", "qualname": "CoLBERTConfig.save", "kind": "function", "doc": "Parameters \n\n\nfname : file name \npath : Path to save \n \n", "signature": "(self , path , fname = 'colbert_config.json' ): ", "funcdef": "def"}, {"fullname": "debeir.models.colbert.CoLBERTConfig.load", "modulename": "debeir.models.colbert", "qualname": "CoLBERTConfig.load", "kind": "function", "doc": "Load the ColBERT config from path (don't point to file name just directory)
\n\nReturns \n", "signature": "(cls , path , fname = 'colbert_config.json' ): ", "funcdef": "def"}, {"fullname": "debeir.models.colbert.ConvolutionalBlock", "modulename": "debeir.models.colbert", "qualname": "ConvolutionalBlock", "kind": "class", "doc": "Base class for all neural network modules.
\n\nYour models should also subclass this class.
\n\nModules can also contain other Modules, allowing to nest them in\na tree structure. You can assign the submodules as regular attributes::
\n\nimport torch.nn as nn\nimport torch.nn.functional as F\n\nclass Model(nn.Module):\n def __init__(self):\n super().__init__()\n self.conv1 = nn.Conv2d(1, 20, 5)\n self.conv2 = nn.Conv2d(20, 20, 5)\n\n def forward(self, x):\n x = F.relu(self.conv1(x))\n return F.relu(self.conv2(x))\n
\n\nSubmodules assigned in this way will be registered, and will have their\nparameters converted too when you call to()
, etc.
\n\n\n\n
As per the example above, an __init__()
call to the parent class\nmust be made before assignment on the child.
\n\n
\n\n:ivar training: Boolean represents whether this module is in training or\n evaluation mode.\n:vartype training: bool
\n", "bases": "torch.nn.modules.module.Module"}, {"fullname": "debeir.models.colbert.ConvolutionalBlock.__init__", "modulename": "debeir.models.colbert", "qualname": "ConvolutionalBlock.__init__", "kind": "function", "doc": "Initializes internal Module state, shared by both nn.Module and ScriptModule.
\n", "signature": "(\tin_channels , \tout_channels , \tkernel_size = 1 , \tfirst_stride = 1 , \tact_func=<class 'torch.nn.modules.activation.ReLU'> ) "}, {"fullname": "debeir.models.colbert.ConvolutionalBlock.forward", "modulename": "debeir.models.colbert", "qualname": "ConvolutionalBlock.forward", "kind": "function", "doc": "Defines the computation performed at every call.
\n\nShould be overridden by all subclasses.
\n\n\n\n
Although the recipe for forward pass needs to be defined within\nthis function, one should call the Module
instance afterwards\ninstead of this since the former takes care of running the\nregistered hooks while the latter silently ignores them.
\n\n
\n", "signature": "(self , x ): ", "funcdef": "def"}, {"fullname": "debeir.models.colbert.KMaxPool", "modulename": "debeir.models.colbert", "qualname": "KMaxPool", "kind": "class", "doc": "Base class for all neural network modules.
\n\nYour models should also subclass this class.
\n\nModules can also contain other Modules, allowing to nest them in\na tree structure. You can assign the submodules as regular attributes::
\n\nimport torch.nn as nn\nimport torch.nn.functional as F\n\nclass Model(nn.Module):\n def __init__(self):\n super().__init__()\n self.conv1 = nn.Conv2d(1, 20, 5)\n self.conv2 = nn.Conv2d(20, 20, 5)\n\n def forward(self, x):\n x = F.relu(self.conv1(x))\n return F.relu(self.conv2(x))\n
\n\nSubmodules assigned in this way will be registered, and will have their\nparameters converted too when you call to()
, etc.
\n\n\n\n
As per the example above, an __init__()
call to the parent class\nmust be made before assignment on the child.
\n\n
\n\n:ivar training: Boolean represents whether this module is in training or\n evaluation mode.\n:vartype training: bool
\n", "bases": "torch.nn.modules.module.Module"}, {"fullname": "debeir.models.colbert.KMaxPool.__init__", "modulename": "debeir.models.colbert", "qualname": "KMaxPool.__init__", "kind": "function", "doc": "Initializes internal Module state, shared by both nn.Module and ScriptModule.
\n", "signature": "(k = 1 ) "}, {"fullname": "debeir.models.colbert.KMaxPool.forward", "modulename": "debeir.models.colbert", "qualname": "KMaxPool.forward", "kind": "function", "doc": "Defines the computation performed at every call.
\n\nShould be overridden by all subclasses.
\n\n\n\n
Although the recipe for forward pass needs to be defined within\nthis function, one should call the Module
instance afterwards\ninstead of this since the former takes care of running the\nregistered hooks while the latter silently ignores them.
\n\n
\n", "signature": "(self , x ): ", "funcdef": "def"}, {"fullname": "debeir.models.colbert.visualisation_dump", "modulename": "debeir.models.colbert", "qualname": "visualisation_dump", "kind": "function", "doc": "
\n", "signature": "(argmax , input_tensors ): ", "funcdef": "def"}, {"fullname": "debeir.models.colbert.ResidualBlock", "modulename": "debeir.models.colbert", "qualname": "ResidualBlock", "kind": "class", "doc": "Base class for all neural network modules.
\n\nYour models should also subclass this class.
\n\nModules can also contain other Modules, allowing to nest them in\na tree structure. You can assign the submodules as regular attributes::
\n\nimport torch.nn as nn\nimport torch.nn.functional as F\n\nclass Model(nn.Module):\n def __init__(self):\n super().__init__()\n self.conv1 = nn.Conv2d(1, 20, 5)\n self.conv2 = nn.Conv2d(20, 20, 5)\n\n def forward(self, x):\n x = F.relu(self.conv1(x))\n return F.relu(self.conv2(x))\n
\n\nSubmodules assigned in this way will be registered, and will have their\nparameters converted too when you call to()
, etc.
\n\n\n\n
As per the example above, an __init__()
call to the parent class\nmust be made before assignment on the child.
\n\n
\n\n:ivar training: Boolean represents whether this module is in training or\n evaluation mode.\n:vartype training: bool
\n", "bases": "torch.nn.modules.module.Module"}, {"fullname": "debeir.models.colbert.ResidualBlock.__init__", "modulename": "debeir.models.colbert", "qualname": "ResidualBlock.__init__", "kind": "function", "doc": "Initializes internal Module state, shared by both nn.Module and ScriptModule.
\n", "signature": "(\tin_channels , \tout_channels , \toptional_shortcut = True , \tkernel_size = 1 , \tact_func=<class 'torch.nn.modules.activation.ReLU'> ) "}, {"fullname": "debeir.models.colbert.ResidualBlock.forward", "modulename": "debeir.models.colbert", "qualname": "ResidualBlock.forward", "kind": "function", "doc": "Defines the computation performed at every call.
\n\nShould be overridden by all subclasses.
\n\n\n\n
Although the recipe for forward pass needs to be defined within\nthis function, one should call the Module
instance afterwards\ninstead of this since the former takes care of running the\nregistered hooks while the latter silently ignores them.
\n\n
\n", "signature": "(self , x ): ", "funcdef": "def"}, {"fullname": "debeir.models.colbert.ColBERT", "modulename": "debeir.models.colbert", "qualname": "ColBERT", "kind": "class", "doc": "Base class for all neural network modules.
\n\nYour models should also subclass this class.
\n\nModules can also contain other Modules, allowing to nest them in\na tree structure. You can assign the submodules as regular attributes::
\n\nimport torch.nn as nn\nimport torch.nn.functional as F\n\nclass Model(nn.Module):\n def __init__(self):\n super().__init__()\n self.conv1 = nn.Conv2d(1, 20, 5)\n self.conv2 = nn.Conv2d(20, 20, 5)\n\n def forward(self, x):\n x = F.relu(self.conv1(x))\n return F.relu(self.conv2(x))\n
\n\nSubmodules assigned in this way will be registered, and will have their\nparameters converted too when you call to()
, etc.
\n\n\n\n
As per the example above, an __init__()
call to the parent class\nmust be made before assignment on the child.
\n\n
\n\n:ivar training: Boolean represents whether this module is in training or\n evaluation mode.\n:vartype training: bool
\n", "bases": "torch.nn.modules.module.Module"}, {"fullname": "debeir.models.colbert.ColBERT.__init__", "modulename": "debeir.models.colbert", "qualname": "ColBERT.__init__", "kind": "function", "doc": "Initializes internal Module state, shared by both nn.Module and ScriptModule.
\n", "signature": "(\tbert_model_args , \tbert_model_kwargs , \tconfig : transformers . models . bert . configuration_bert . BertConfig , \tdevice : str , \tmax_seq_len : int = 128 , \tk : int = 8 , \toptional_shortcut : bool = True , \thidden_neurons : int = 2048 , \tuse_batch_norms : bool = True , \tuse_trans_blocks : bool = False , \tresidual_kernel_size : int = 1 , \tdropout_perc : float = 0.5 , \tact_func = 'mish' , \tloss_func = 'cross_entropy_loss' , \t** kwargs ) "}, {"fullname": "debeir.models.colbert.ColBERT.forward", "modulename": "debeir.models.colbert", "qualname": "ColBERT.forward", "kind": "function", "doc": "Defines the computation performed at every call.
\n\nShould be overridden by all subclasses.
\n\n\n\n
Although the recipe for forward pass needs to be defined within\nthis function, one should call the Module
instance afterwards\ninstead of this since the former takes care of running the\nregistered hooks while the latter silently ignores them.
\n\n
\n", "signature": "(self , * args , ** kwargs ): ", "funcdef": "def"}, {"fullname": "debeir.models.colbert.ColBERT.from_config", "modulename": "debeir.models.colbert", "qualname": "ColBERT.from_config", "kind": "function", "doc": "
\n", "signature": "(cls , * args , config_path ): ", "funcdef": "def"}, {"fullname": "debeir.models.colbert.ColBERT.from_pretrained", "modulename": "debeir.models.colbert", "qualname": "ColBERT.from_pretrained", "kind": "function", "doc": "
\n", "signature": "(cls , output_dir , ** kwargs ): ", "funcdef": "def"}, {"fullname": "debeir.models.colbert.ColBERT.save_pretrained", "modulename": "debeir.models.colbert", "qualname": "ColBERT.save_pretrained", "kind": "function", "doc": "
\n", "signature": "(self , output_dir ): ", "funcdef": "def"}, {"fullname": "debeir.models.colbert.ComBERT", "modulename": "debeir.models.colbert", "qualname": "ComBERT", "kind": "class", "doc": "Base class for all neural network modules.
\n\nYour models should also subclass this class.
\n\nModules can also contain other Modules, allowing to nest them in\na tree structure. You can assign the submodules as regular attributes::
\n\nimport torch.nn as nn\nimport torch.nn.functional as F\n\nclass Model(nn.Module):\n def __init__(self):\n super().__init__()\n self.conv1 = nn.Conv2d(1, 20, 5)\n self.conv2 = nn.Conv2d(20, 20, 5)\n\n def forward(self, x):\n x = F.relu(self.conv1(x))\n return F.relu(self.conv2(x))\n
\n\nSubmodules assigned in this way will be registered, and will have their\nparameters converted too when you call to()
, etc.
\n\n\n\n
As per the example above, an __init__()
call to the parent class\nmust be made before assignment on the child.
\n\n
\n\n:ivar training: Boolean represents whether this module is in training or\n evaluation mode.\n:vartype training: bool
\n", "bases": "torch.nn.modules.module.Module"}, {"fullname": "debeir.models.colbert.ComBERT.__init__", "modulename": "debeir.models.colbert", "qualname": "ComBERT.__init__", "kind": "function", "doc": "Initializes internal Module state, shared by both nn.Module and ScriptModule.
\n", "signature": "(\tbert_model_args , \tbert_model_kwargs , \tconfig : transformers . models . bert . configuration_bert . BertConfig , \tdevice : str , \tmax_seq_len : int = 128 , \tk : int = 8 , \toptional_shortcut : bool = True , \thidden_neurons : int = 2048 , \tuse_batch_norms : bool = True , \tuse_trans_blocks : bool = False , \tresidual_kernel_size : int = 1 , \tdropout_perc : float = 0.5 , \tact_func = 'mish' , \tloss_func = 'cross_entropy_loss' , \tnum_blocks = 2 , \t** kwargs ) "}, {"fullname": "debeir.models.colbert.ComBERT.forward", "modulename": "debeir.models.colbert", "qualname": "ComBERT.forward", "kind": "function", "doc": "Defines the computation performed at every call.
\n\nShould be overridden by all subclasses.
\n\n\n\n
Although the recipe for forward pass needs to be defined within\nthis function, one should call the Module
instance afterwards\ninstead of this since the former takes care of running the\nregistered hooks while the latter silently ignores them.
\n\n
\n", "signature": "(self , * args , ** kwargs ): ", "funcdef": "def"}, {"fullname": "debeir.models.colbert.ComBERT.from_config", "modulename": "debeir.models.colbert", "qualname": "ComBERT.from_config", "kind": "function", "doc": "
\n", "signature": "(cls , * args , config_path ): ", "funcdef": "def"}, {"fullname": "debeir.models.colbert.ComBERT.from_pretrained", "modulename": "debeir.models.colbert", "qualname": "ComBERT.from_pretrained", "kind": "function", "doc": "
\n", "signature": "(cls , output_dir , ** kwargs ): ", "funcdef": "def"}, {"fullname": "debeir.models.colbert.ComBERT.save_pretrained", "modulename": "debeir.models.colbert", "qualname": "ComBERT.save_pretrained", "kind": "function", "doc": "
\n", "signature": "(self , output_dir ): ", "funcdef": "def"}, {"fullname": "debeir.rankers", "modulename": "debeir.rankers", "kind": "module", "doc": "Rankers module.
\n\n\n Includes runnable out-of-box training code\n Custom ranking loss functions (e.g. LambdaLoss, NDCGLoss)\n Includes custom rankers for reranking or NIR-style queries.
\n \n"}, {"fullname": "debeir.rankers.reranking", "modulename": "debeir.rankers.reranking", "kind": "module", "doc": "
\n"}, {"fullname": "debeir.rankers.reranking.nir", "modulename": "debeir.rankers.reranking.nir", "kind": "module", "doc": "NIR Reranker
\n\n[Insert paper link here]
\n"}, {"fullname": "debeir.rankers.reranking.nir.NIReRanker", "modulename": "debeir.rankers.reranking.nir", "qualname": "NIReRanker", "kind": "class", "doc": "Re-ranker which uses the NIR scoring method\n score = log(bm25)/log(z) + cosine_sum
\n", "bases": "debeir.rankers.reranking.reranker.DocumentReRanker"}, {"fullname": "debeir.rankers.reranking.nir.NIReRanker.__init__", "modulename": "debeir.rankers.reranking.nir", "qualname": "NIReRanker.__init__", "kind": "function", "doc": "
\n", "signature": "(\tquery , \tranked_list : List [ debeir . core . document . Document ] , \tencoder : debeir . rankers . transformer_sent_encoder . Encoder , \tdistance_fn =< function cosine > , \tfacets_weights : Dict = None , \tpresort = False , \tfields_to_encode = None , \t* args , \t** kwargs ) "}, {"fullname": "debeir.rankers.reranking.reranker", "modulename": "debeir.rankers.reranking.reranker", "kind": "module", "doc": "General re-ranking interfaces to be implemented by child classes.
\n"}, {"fullname": "debeir.rankers.reranking.reranker.ReRanker", "modulename": "debeir.rankers.reranking.reranker", "qualname": "ReRanker", "kind": "class", "doc": "General interface for a reranking.
\n\nChild classes should implement the abstract methods.
\n"}, {"fullname": "debeir.rankers.reranking.reranker.ReRanker.__init__", "modulename": "debeir.rankers.reranking.reranker", "qualname": "ReRanker.__init__", "kind": "function", "doc": "
\n", "signature": "(query , ranked_list : List , * args , ** kwargs ) "}, {"fullname": "debeir.rankers.reranking.reranker.ReRanker.rerank", "modulename": "debeir.rankers.reranking.reranker", "qualname": "ReRanker.rerank", "kind": "function", "doc": "Re-rank the passed ranked list based on implemented private _compute_scores method.
\n\nParameters \n\n\n\nReturns \n\n\nA ranked list in descending order of the score field (which will be the last item in the list)\n
\n \n", "signature": "(self ) -> List : ", "funcdef": "def"}, {"fullname": "debeir.rankers.reranking.reranker.DocumentReRanker", "modulename": "debeir.rankers.reranking.reranker", "qualname": "DocumentReRanker", "kind": "class", "doc": "Reranking interface for a ranked list of Document objects.
\n", "bases": "ReRanker"}, {"fullname": "debeir.rankers.reranking.reranker.DocumentReRanker.__init__", "modulename": "debeir.rankers.reranking.reranker", "qualname": "DocumentReRanker.__init__", "kind": "function", "doc": "
\n", "signature": "(\tquery , \tranked_list : List [ debeir . core . document . Document ] , \t* args , \t** kwargs ) "}, {"fullname": "debeir.rankers.reranking.reranker.ReRankerPool", "modulename": "debeir.rankers.reranking.reranker", "qualname": "ReRankerPool", "kind": "class", "doc": "
\n"}, {"fullname": "debeir.rankers.reranking.reranker.ReRankerPool.__init__", "modulename": "debeir.rankers.reranking.reranker", "qualname": "ReRankerPool.__init__", "kind": "function", "doc": "
\n", "signature": "() "}, {"fullname": "debeir.rankers.reranking.use", "modulename": "debeir.rankers.reranking.use", "kind": "module", "doc": "
\n"}, {"fullname": "debeir.rankers.reranking.use.USEReRanker", "modulename": "debeir.rankers.reranking.use", "qualname": "USEReRanker", "kind": "class", "doc": "Re-ranks based on the cosine_sum rather the complete NIR scoring
\n", "bases": "debeir.rankers.reranking.nir.NIReRanker"}, {"fullname": "debeir.rankers.reranking.use.USEReRanker.__init__", "modulename": "debeir.rankers.reranking.use", "qualname": "USEReRanker.__init__", "kind": "function", "doc": "
\n", "signature": "(* args , ** kwargs ) "}, {"fullname": "debeir.rankers.transformer_sent_encoder", "modulename": "debeir.rankers.transformer_sent_encoder", "kind": "module", "doc": "
\n"}, {"fullname": "debeir.rankers.transformer_sent_encoder.Encoder", "modulename": "debeir.rankers.transformer_sent_encoder", "qualname": "Encoder", "kind": "class", "doc": "A wrapper for the Sentence Transformer Encoder used in Universal Sentence Embeddings (USE) for ranking or reranking.
\n\nParameters \n\n\nmodel_path : The path to a sentence transformer or transformer model. \nnormalize : Normalize the output vectors to unit length for dot product retrieval rather than cosine. \nspacy_model : the spacy or scispacy model to use for sentence boundary detection. \nmax_length : Maximum input length for the spacy nlp model. \n \n"}, {"fullname": "debeir.rankers.transformer_sent_encoder.Encoder.__init__", "modulename": "debeir.rankers.transformer_sent_encoder", "qualname": "Encoder.__init__", "kind": "function", "doc": "
\n", "signature": "(\tmodel_path , \tnormalize = False , \tspacy_model = 'en_core_sci_md' , \tmax_length = 2000000 ) "}, {"fullname": "debeir.rankers.transformer_sent_encoder.Encoder.encode", "modulename": "debeir.rankers.transformer_sent_encoder", "qualname": "Encoder.encode", "kind": "function", "doc": "Computes sentence embeddings for a given topic, uses spacy for sentence segmentation.\nBy default, uses a cache to store previously computed vectors. Pass \"disable_cache\" as a kwarg to disable this.
\n\nParameters \n\n\ntopic : The topic (a list of sentences) to encode. Should be a raw string. \ndisable_cache : keyword argument, pass as True to disable encoding caching. \n \n\nReturns \n\n\nReturns a list of encoded tensors is returned.\n
\n \n", "signature": "(self , topic : str ) -> List : ", "funcdef": "def"}, {"fullname": "debeir.training", "modulename": "debeir.training", "kind": "module", "doc": "Runnable out-of-the-box code for training re-rankers.
\n"}, {"fullname": "debeir.training.evaluate_reranker", "modulename": "debeir.training.evaluate_reranker", "kind": "module", "doc": "
\n"}, {"fullname": "debeir.training.evaluate_reranker.SentenceEvaluator", "modulename": "debeir.training.evaluate_reranker", "qualname": "SentenceEvaluator", "kind": "class", "doc": "Evaluation class for computing metrics from TREC-style files
\n", "bases": "debeir.evaluation.evaluator.Evaluator"}, {"fullname": "debeir.training.evaluate_reranker.SentenceEvaluator.__init__", "modulename": "debeir.training.evaluate_reranker", "qualname": "SentenceEvaluator.__init__", "kind": "function", "doc": "
\n", "signature": "(\tmodel : debeir . rankers . transformer_sent_encoder . Encoder , \tdataset : datasets . arrow_dataset . Dataset , \tparsed_topics : Dict [ Union [ str , int ], Dict ] , \ttext_cols : List [ str ] , \tquery_cols : List [ str ] , \tid_col : str , \tdistance_fn : str , \tqrels : str , \tmetrics : List [ str ] ) "}, {"fullname": "debeir.training.evaluate_reranker.SentenceEvaluator.produce_ranked_lists", "modulename": "debeir.training.evaluate_reranker", "qualname": "SentenceEvaluator.produce_ranked_lists", "kind": "function", "doc": "
\n", "signature": "(self ): ", "funcdef": "def"}, {"fullname": "debeir.training.hparm_tuning", "modulename": "debeir.training.hparm_tuning", "kind": "module", "doc": "Hyper parameter tuning library using Optuna and Wandb
\n"}, {"fullname": "debeir.training.hparm_tuning.config", "modulename": "debeir.training.hparm_tuning.config", "kind": "module", "doc": "
\n"}, {"fullname": "debeir.training.hparm_tuning.config.HparamConfig", "modulename": "debeir.training.hparm_tuning.config", "qualname": "HparamConfig", "kind": "class", "doc": "Hyperparameter configuration file
\n\nExpects a dictionary of hyperparameters
\n\nhparams: Dict\n{\n \"learning_rate\": {\n \"type\": float\n \"low\": 0.1\n \"high\": 1.0\n \"step\": 0.1\n # OR\n args: [0.1, 1.0, 0.1]\n },\n}
\n", "bases": "debeir.core.config.Config"}, {"fullname": "debeir.training.hparm_tuning.config.HparamConfig.__init__", "modulename": "debeir.training.hparm_tuning.config", "qualname": "HparamConfig.__init__", "kind": "function", "doc": "
\n", "signature": "(hparams : Dict [ str , Dict ] ) "}, {"fullname": "debeir.training.hparm_tuning.config.HparamConfig.from_json", "modulename": "debeir.training.hparm_tuning.config", "qualname": "HparamConfig.from_json", "kind": "function", "doc": "
\n", "signature": "(cls , fp ) -> debeir . training . hparm_tuning . config . HparamConfig : ", "funcdef": "def"}, {"fullname": "debeir.training.hparm_tuning.config.HparamConfig.validate", "modulename": "debeir.training.hparm_tuning.config", "qualname": "HparamConfig.validate", "kind": "function", "doc": "Validates if the config is correct.\nMust be implemented by inherited classes.
\n", "signature": "(self ): ", "funcdef": "def"}, {"fullname": "debeir.training.hparm_tuning.config.HparamConfig.parse_config_to_py", "modulename": "debeir.training.hparm_tuning.config", "qualname": "HparamConfig.parse_config_to_py", "kind": "function", "doc": "Parses configuration file into usable python objects
\n", "signature": "(self ): ", "funcdef": "def"}, {"fullname": "debeir.training.hparm_tuning.optuna_rank", "modulename": "debeir.training.hparm_tuning.optuna_rank", "kind": "module", "doc": "
\n"}, {"fullname": "debeir.training.hparm_tuning.optuna_rank.objective", "modulename": "debeir.training.hparm_tuning.optuna_rank", "qualname": "objective", "kind": "function", "doc": "
\n", "signature": "(\ttrainer : debeir . training . hparm_tuning . trainer . Trainer , \ttrial : optuna . trial . _trial . Trial ): ", "funcdef": "def"}, {"fullname": "debeir.training.hparm_tuning.optuna_rank.run_optuna_with_wandb", "modulename": "debeir.training.hparm_tuning.optuna_rank", "qualname": "run_optuna_with_wandb", "kind": "function", "doc": "Partially initialize the objective function with a trainer and hparams to optimize.
\n\nOptimize using the optuna library.
\n\nParameters \n\n\ntrainer : \nn_trials : \nmaximize_objective : \nwandb_kwargs : \n \n\nReturns \n", "signature": "(\ttrainer , \tn_trials = 100 , \tn_jobs = 1 , \tmaximize_objective = True , \tsave_study_path = '.' , \twandb_kwargs = None ): ", "funcdef": "def"}, {"fullname": "debeir.training.hparm_tuning.optuna_rank.print_optuna_stats", "modulename": "debeir.training.hparm_tuning.optuna_rank", "qualname": "print_optuna_stats", "kind": "function", "doc": "
\n", "signature": "(study : optuna . study . study . Study ): ", "funcdef": "def"}, {"fullname": "debeir.training.hparm_tuning.trainer", "modulename": "debeir.training.hparm_tuning.trainer", "kind": "module", "doc": "
\n"}, {"fullname": "debeir.training.hparm_tuning.trainer.OptimizersWrapper", "modulename": "debeir.training.hparm_tuning.trainer", "qualname": "OptimizersWrapper", "kind": "class", "doc": "
\n"}, {"fullname": "debeir.training.hparm_tuning.trainer.OptimizersWrapper.__init__", "modulename": "debeir.training.hparm_tuning.trainer", "qualname": "OptimizersWrapper.__init__", "kind": "function", "doc": "
\n", "signature": "() "}, {"fullname": "debeir.training.hparm_tuning.trainer.Trainer", "modulename": "debeir.training.hparm_tuning.trainer", "qualname": "Trainer", "kind": "class", "doc": "Wrapper class for a trainer class.
\n"}, {"fullname": "debeir.training.hparm_tuning.trainer.Trainer.__init__", "modulename": "debeir.training.hparm_tuning.trainer", "qualname": "Trainer.__init__", "kind": "function", "doc": "
\n", "signature": "(model , evaluator_fn , dataset_loading_fn ) "}, {"fullname": "debeir.training.hparm_tuning.trainer.Trainer.fit", "modulename": "debeir.training.hparm_tuning.trainer", "qualname": "Trainer.fit", "kind": "function", "doc": "
\n", "signature": "(\tself , \tin_trial : optuna . trial . _trial . Trial , \ttrain_dataset , \tval_dataset ): ", "funcdef": "def"}, {"fullname": "debeir.training.hparm_tuning.trainer.SentenceTransformerHparamTrainer", "modulename": "debeir.training.hparm_tuning.trainer", "qualname": "SentenceTransformerHparamTrainer", "kind": "class", "doc": "See Optuna documentation for types!
\n", "bases": "Trainer"}, {"fullname": "debeir.training.hparm_tuning.trainer.SentenceTransformerHparamTrainer.__init__", "modulename": "debeir.training.hparm_tuning.trainer", "qualname": "SentenceTransformerHparamTrainer.__init__", "kind": "function", "doc": "
\n", "signature": "(\tdataset_loading_fn , \tevaluator_fn , \thparams_config : debeir . training . hparm_tuning . config . HparamConfig ) "}, {"fullname": "debeir.training.hparm_tuning.trainer.SentenceTransformerHparamTrainer.get_optuna_hparams", "modulename": "debeir.training.hparm_tuning.trainer", "qualname": "SentenceTransformerHparamTrainer.get_optuna_hparams", "kind": "function", "doc": "Get hyperparameters suggested by the optuna library
\n\nParameters \n\n\ntrial : The optuna trial object \nhparams : Optional, pass a dictionary of HparamType[Enum] objects \n \n\nReturns \n", "signature": "(\tself , \ttrial : optuna . trial . _trial . Trial , \thparams : Sequence [ debeir . training . hparm_tuning . types . Hparam ] = None ): ", "funcdef": "def"}, {"fullname": "debeir.training.hparm_tuning.trainer.SentenceTransformerHparamTrainer.build_kwargs_and_model", "modulename": "debeir.training.hparm_tuning.trainer", "qualname": "SentenceTransformerHparamTrainer.build_kwargs_and_model", "kind": "function", "doc": "
\n", "signature": "(self , hparams : Dict ): ", "funcdef": "def"}, {"fullname": "debeir.training.hparm_tuning.trainer.SentenceTransformerHparamTrainer.fit", "modulename": "debeir.training.hparm_tuning.trainer", "qualname": "SentenceTransformerHparamTrainer.fit", "kind": "function", "doc": "
\n", "signature": "(\tself , \tin_trial : optuna . trial . _trial . Trial , \ttrain_dataset , \tval_dataset ): ", "funcdef": "def"}, {"fullname": "debeir.training.hparm_tuning.trainer.trial_callback", "modulename": "debeir.training.hparm_tuning.trainer", "qualname": "trial_callback", "kind": "function", "doc": "
\n", "signature": "(trial , score , epoch , * args , ** kwargs ): ", "funcdef": "def"}, {"fullname": "debeir.training.hparm_tuning.trainer.SentenceTransformerTrainer", "modulename": "debeir.training.hparm_tuning.trainer", "qualname": "SentenceTransformerTrainer", "kind": "class", "doc": "See Optuna documentation for types!
\n", "bases": "SentenceTransformerHparamTrainer"}, {"fullname": "debeir.training.hparm_tuning.trainer.SentenceTransformerTrainer.__init__", "modulename": "debeir.training.hparm_tuning.trainer", "qualname": "SentenceTransformerTrainer.__init__", "kind": "function", "doc": "
\n", "signature": "(\tdataset : Union [ datasets . dataset_dict . DatasetDict , Dict [ str , datasets . arrow_dataset . Dataset ]] , \thparams_config : debeir . training . hparm_tuning . config . HparamConfig , \tevaluator_fn = None , \tevaluator = None , \tuse_wandb = False ) "}, {"fullname": "debeir.training.hparm_tuning.trainer.SentenceTransformerTrainer.fit", "modulename": "debeir.training.hparm_tuning.trainer", "qualname": "SentenceTransformerTrainer.fit", "kind": "function", "doc": "
\n", "signature": "(self , ** extra_kwargs ): ", "funcdef": "def"}, {"fullname": "debeir.training.hparm_tuning.types", "modulename": "debeir.training.hparm_tuning.types", "kind": "module", "doc": "
\n"}, {"fullname": "debeir.training.hparm_tuning.types.Hparam", "modulename": "debeir.training.hparm_tuning.types", "qualname": "Hparam", "kind": "class", "doc": "
\n"}, {"fullname": "debeir.training.hparm_tuning.types.Hparam.__init__", "modulename": "debeir.training.hparm_tuning.types", "qualname": "Hparam.__init__", "kind": "function", "doc": "
\n", "signature": "() "}, {"fullname": "debeir.training.hparm_tuning.types.Hparam.suggest", "modulename": "debeir.training.hparm_tuning.types", "qualname": "Hparam.suggest", "kind": "function", "doc": "
\n", "signature": "(self , * args , ** kwargs ): ", "funcdef": "def"}, {"fullname": "debeir.training.hparm_tuning.types.HparamFloat", "modulename": "debeir.training.hparm_tuning.types", "qualname": "HparamFloat", "kind": "class", "doc": "
\n", "bases": "Hparam"}, {"fullname": "debeir.training.hparm_tuning.types.HparamFloat.__init__", "modulename": "debeir.training.hparm_tuning.types", "qualname": "HparamFloat.__init__", "kind": "function", "doc": "
\n", "signature": "(\tname : str , \tlow : float , \thigh : float , \tlog : bool = False , \tstep : float = None ) "}, {"fullname": "debeir.training.hparm_tuning.types.HparamFloat.suggest", "modulename": "debeir.training.hparm_tuning.types", "qualname": "HparamFloat.suggest", "kind": "function", "doc": "
\n", "signature": "(self , trial : optuna . trial . _trial . Trial ): ", "funcdef": "def"}, {"fullname": "debeir.training.hparm_tuning.types.HparamInt", "modulename": "debeir.training.hparm_tuning.types", "qualname": "HparamInt", "kind": "class", "doc": "
\n", "bases": "Hparam"}, {"fullname": "debeir.training.hparm_tuning.types.HparamInt.__init__", "modulename": "debeir.training.hparm_tuning.types", "qualname": "HparamInt.__init__", "kind": "function", "doc": "
\n", "signature": "(name : str , low : int , high : int , log : bool = False , step : int = 1 ) "}, {"fullname": "debeir.training.hparm_tuning.types.HparamInt.suggest", "modulename": "debeir.training.hparm_tuning.types", "qualname": "HparamInt.suggest", "kind": "function", "doc": "
\n", "signature": "(self , trial : optuna . trial . _trial . Trial ): ", "funcdef": "def"}, {"fullname": "debeir.training.hparm_tuning.types.HparamCategorical", "modulename": "debeir.training.hparm_tuning.types", "qualname": "HparamCategorical", "kind": "class", "doc": "
\n", "bases": "Hparam"}, {"fullname": "debeir.training.hparm_tuning.types.HparamCategorical.__init__", "modulename": "debeir.training.hparm_tuning.types", "qualname": "HparamCategorical.__init__", "kind": "function", "doc": "
\n", "signature": "(name : str , choices : Sequence , func : str = 'suggest_categorical' ) "}, {"fullname": "debeir.training.hparm_tuning.types.HparamCategorical.suggest", "modulename": "debeir.training.hparm_tuning.types", "qualname": "HparamCategorical.suggest", "kind": "function", "doc": "
\n", "signature": "(self , trial : optuna . trial . _trial . Trial ): ", "funcdef": "def"}, {"fullname": "debeir.training.hparm_tuning.types.HparamUniform", "modulename": "debeir.training.hparm_tuning.types", "qualname": "HparamUniform", "kind": "class", "doc": "
\n", "bases": "Hparam"}, {"fullname": "debeir.training.hparm_tuning.types.HparamUniform.__init__", "modulename": "debeir.training.hparm_tuning.types", "qualname": "HparamUniform.__init__", "kind": "function", "doc": "
\n", "signature": "(name : str , low : float , high : float , func : str = 'suggest_uniform' ) "}, {"fullname": "debeir.training.hparm_tuning.types.HparamUniform.suggest", "modulename": "debeir.training.hparm_tuning.types", "qualname": "HparamUniform.suggest", "kind": "function", "doc": "
\n", "signature": "(self , trial : optuna . trial . _trial . Trial ): ", "funcdef": "def"}, {"fullname": "debeir.training.hparm_tuning.types.HparamLogUniform", "modulename": "debeir.training.hparm_tuning.types", "qualname": "HparamLogUniform", "kind": "class", "doc": "
\n", "bases": "Hparam"}, {"fullname": "debeir.training.hparm_tuning.types.HparamLogUniform.__init__", "modulename": "debeir.training.hparm_tuning.types", "qualname": "HparamLogUniform.__init__", "kind": "function", "doc": "
\n", "signature": "(name : str , low : float , high : float , func : str = 'suggest_loguniform' ) "}, {"fullname": "debeir.training.hparm_tuning.types.HparamLogUniform.suggest", "modulename": "debeir.training.hparm_tuning.types", "qualname": "HparamLogUniform.suggest", "kind": "function", "doc": "
\n", "signature": "(self , trial : optuna . trial . _trial . Trial ): ", "funcdef": "def"}, {"fullname": "debeir.training.hparm_tuning.types.HparamDiscreteUniform", "modulename": "debeir.training.hparm_tuning.types", "qualname": "HparamDiscreteUniform", "kind": "class", "doc": "
\n", "bases": "Hparam"}, {"fullname": "debeir.training.hparm_tuning.types.HparamDiscreteUniform.__init__", "modulename": "debeir.training.hparm_tuning.types", "qualname": "HparamDiscreteUniform.__init__", "kind": "function", "doc": "
\n", "signature": "(\tname : str , \tlow : float , \thigh : float , \tq : float , \tfunc : str = 'suggest_discrete_uniform' ) "}, {"fullname": "debeir.training.hparm_tuning.types.HparamDiscreteUniform.suggest", "modulename": "debeir.training.hparm_tuning.types", "qualname": "HparamDiscreteUniform.suggest", "kind": "function", "doc": "
\n", "signature": "(self , trial : optuna . trial . _trial . Trial ): ", "funcdef": "def"}, {"fullname": "debeir.training.losses", "modulename": "debeir.training.losses", "kind": "module", "doc": "
\n"}, {"fullname": "debeir.training.losses.contrastive", "modulename": "debeir.training.losses.contrastive", "kind": "module", "doc": "Author: Yonglong Tian (yonglong@mit.edu)\nDate: May 07, 2020
\n\nCode imported from: https://github.com/HobbitLong/SupContrast/blob/master/losses.py
\n"}, {"fullname": "debeir.training.losses.contrastive.SupConLoss", "modulename": "debeir.training.losses.contrastive", "qualname": "SupConLoss", "kind": "class", "doc": "Supervised Contrastive Learning: https://arxiv.org/pdf/2004.11362.pdf .\nIt also supports the unsupervised contrastive loss in SimCLR
\n", "bases": "torch.nn.modules.module.Module"}, {"fullname": "debeir.training.losses.contrastive.SupConLoss.__init__", "modulename": "debeir.training.losses.contrastive", "qualname": "SupConLoss.__init__", "kind": "function", "doc": "Initializes internal Module state, shared by both nn.Module and ScriptModule.
\n", "signature": "(temperature = 1.0 , contrast_mode = 'all' , base_temperature = 1.0 ) "}, {"fullname": "debeir.training.losses.contrastive.SupConLoss.forward", "modulename": "debeir.training.losses.contrastive", "qualname": "SupConLoss.forward", "kind": "function", "doc": "Compute loss for model. If both labels
and mask
are None,\nit degenerates to SimCLR unsupervised loss:\nhttps://arxiv.org/pdf/2002.05709.pdf \nArgs:\n features: hidden vector of shape [bsz, n_views, ...].\n labels: ground truth of shape [bsz].\n mask: contrastive mask of shape [bsz, bsz], mask_{i,j}=1 if sample j\n has the same class as sample i. Can be asymmetric.\nReturns:\n A loss scalar.
\n", "signature": "(self , features , labels = None , mask = None ): ", "funcdef": "def"}, {"fullname": "debeir.training.losses.contrastive.SiameseDistanceMetric", "modulename": "debeir.training.losses.contrastive", "qualname": "SiameseDistanceMetric", "kind": "class", "doc": "The metric for the contrastive loss
\n", "bases": "enum.Enum"}, {"fullname": "debeir.training.losses.contrastive.SiameseDistanceMetric.EUCLIDEAN", "modulename": "debeir.training.losses.contrastive", "qualname": "SiameseDistanceMetric.EUCLIDEAN", "kind": "function", "doc": "
\n", "signature": "(x , y ): ", "funcdef": "def"}, {"fullname": "debeir.training.losses.contrastive.SiameseDistanceMetric.MANHATTAN", "modulename": "debeir.training.losses.contrastive", "qualname": "SiameseDistanceMetric.MANHATTAN", "kind": "function", "doc": "
\n", "signature": "(x , y ): ", "funcdef": "def"}, {"fullname": "debeir.training.losses.contrastive.SiameseDistanceMetric.COSINE_DISTANCE", "modulename": "debeir.training.losses.contrastive", "qualname": "SiameseDistanceMetric.COSINE_DISTANCE", "kind": "function", "doc": "
\n", "signature": "(x , y ): ", "funcdef": "def"}, {"fullname": "debeir.training.losses.contrastive.ContrastiveSentLoss", "modulename": "debeir.training.losses.contrastive", "qualname": "ContrastiveSentLoss", "kind": "class", "doc": "Contrastive loss. Expects as input two texts and a label of either 0 or 1. If the label == 1, then the distance between the\ntwo embeddings is reduced. If the label == 0, then the distance between the embeddings is increased.\nFurther information: http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf
\n\nParameters \n\n\nmodel : SentenceTransformer model \ndistance_metric : Function that returns a distance between two emeddings. The class SiameseDistanceMetric contains pre-defined metrices that can be used \nmargin : Negative samples (label == 0) should have a distance of at least the margin value. \nsize_average : Average by the size of the mini-batch.\nExample::\nfrom sentence_transformers import SentenceTransformer, LoggingHandler, losses, InputExample\nfrom torch.utils.data import DataLoader\nmodel = SentenceTransformer('all-MiniLM-L6-v2')\ntrain_examples = [\n InputExample(texts=['This is a positive pair', 'Where the distance will be minimized'], label=1),\n InputExample(texts=['This is a negative pair', 'Their distance will be increased'], label=0)]\ntrain_dataloader = DataLoader(train_examples, shuffle=True, batch_size=2)\ntrain_loss = losses.ContrastiveLoss(model=model)\nmodel.fit([(train_dataloader, train_loss)], show_progress_bar=True) \n \n", "bases": "torch.nn.modules.module.Module"}, {"fullname": "debeir.training.losses.contrastive.ContrastiveSentLoss.__init__", "modulename": "debeir.training.losses.contrastive", "qualname": "ContrastiveSentLoss.__init__", "kind": "function", "doc": "Initializes internal Module state, shared by both nn.Module and ScriptModule.
\n", "signature": "(\tmodel , \tdistance_metric =< function SiameseDistanceMetric .< lambda >> , \tmargin : float = 0.5 , \tsize_average : bool = True ) "}, {"fullname": "debeir.training.losses.contrastive.ContrastiveSentLoss.get_config_dict", "modulename": "debeir.training.losses.contrastive", "qualname": "ContrastiveSentLoss.get_config_dict", "kind": "function", "doc": "
\n", "signature": "(self ): ", "funcdef": "def"}, {"fullname": "debeir.training.losses.contrastive.ContrastiveSentLoss.forward", "modulename": "debeir.training.losses.contrastive", "qualname": "ContrastiveSentLoss.forward", "kind": "function", "doc": "Defines the computation performed at every call.
\n\nShould be overridden by all subclasses.
\n\n\n\n
Although the recipe for forward pass needs to be defined within\nthis function, one should call the Module
instance afterwards\ninstead of this since the former takes care of running the\nregistered hooks while the latter silently ignores them.
\n\n
\n", "signature": "(\tself , \tsentence_features : Iterable [ Dict [ str , torch . Tensor ]] , \tlabels : torch . Tensor ): ", "funcdef": "def"}, {"fullname": "debeir.training.losses.ranking", "modulename": "debeir.training.losses.ranking", "kind": "module", "doc": "Losses are drawn from the allrank library
\n"}, {"fullname": "debeir.training.train_reranker", "modulename": "debeir.training.train_reranker", "kind": "module", "doc": "
\n"}, {"fullname": "debeir.training.train_reranker.train_cross_encoder_reranker", "modulename": "debeir.training.train_reranker", "qualname": "train_cross_encoder_reranker", "kind": "function", "doc": "Trains a reranker with relevance signals
\n\nParameters \n\n\nmodel_fp_or_name : The model name or path to the model \noutput_dir : Output directory to save model, logs etc. \ntrain_dataset : Training Examples \ndev_dataset : Dev examples \ntrain_batch_size : Training batch size \nnum_epochs : Number of epochs \nwarmup_steps : Warmup steps for the scheduler \nevaluate_every_n_step : Evaluate the model every n steps \nspecial_tokens : Special tokens to add, defaults to [DOC], [QRY] tokens (bi-encoder) \npooling_mode : Pooling mode for a sentence transformer model \nloss_func : Loss function(s) to use \nevaluator : Evaluator to use \n \n", "signature": "(\tmodel_fp_or_name : str , \toutput_dir : str , \ttrain_dataset : List [ debeir . datasets . types . RelevanceExample ] , \tdev_dataset : List [ debeir . datasets . types . RelevanceExample ] , \ttrain_batch_size = 32 , \tnum_epochs = 3 , \twarmup_steps = None , \tevaluate_every_n_step : int = 1000 , \tspecial_tokens = None , \tpooling_mode = None , \tloss_func = None , \tevaluator : sentence_transformers . evaluation . SentenceEvaluator . SentenceEvaluator = None , \t* args , \t** kwargs ): ", "funcdef": "def"}, {"fullname": "debeir.training.train_sentence_encoder", "modulename": "debeir.training.train_sentence_encoder", "kind": "module", "doc": "
\n"}, {"fullname": "debeir.training.train_sentence_encoder.train_biencoder", "modulename": "debeir.training.train_sentence_encoder", "qualname": "train_biencoder", "kind": "function", "doc": "Train a universal sentence encoder
\n\nParameters \n\n\nmodel_fp_or_name : The model name or path to the model \noutput_dir : Output directory to save model, logs etc. \ntrain_examples : Training Examples \ndev_examples : Dev examples \ntrain_batch_size : Training batch size \nnum_epochs : Number of epochs \nwarmup_steps : Warmup steps for the scheduler \nevaluate_every_n_step : Evaluate the model every n steps \nspecial_tokens : Special tokens to add \npooling_mode : Pooling mode for a sentence transformer model \nloss_func : Loss function(s) to use \nevaluator : Evaluator to use \n \n", "signature": "(\tmodel_fp_or_name : str , \toutput_dir : str , \ttrain_examples : List [ debeir . datasets . types . InputExample ] , \tdev_examples : List [ debeir . datasets . types . InputExample ] , \ttrain_batch_size = 32 , \tnum_epochs = 3 , \twarmup_steps = None , \tevaluate_every_n_step : int = 1000 , \tspecial_tokens = None , \tpooling_mode = None , \tloss_func = None , \tevaluator : sentence_transformers . evaluation . SentenceEvaluator . SentenceEvaluator = None , \t* args , \t** kwargs ): ", "funcdef": "def"}, {"fullname": "debeir.training.train_sentence_encoder.train_huggingface_transformer", "modulename": "debeir.training.train_sentence_encoder", "qualname": "train_huggingface_transformer", "kind": "function", "doc": "Train a transformer model using the Huggingface API
\n\nParameters \n\n\nmodel_fp_or_name_or_cls : Model name or model class to instantiate \ntokenizer : Tokenizer \noutput_dir : Output directory to write to \ncompute_metric_fn : Metric function to compute metrics \nmetric : Metric used by the compute_metric_fn \ndataset : Huggingface Dataset Dict \ntrain_dataset : Training dataset to be used by the Trainer class \neval_dataset : Evaluation dataset to be used by the Trainer class \ntrain_batch_size : Batch size to use for training \nnum_epochs: Number of training epochs (default : 3) \nlearning_rate: Learning rate (default : 5e-5) \nlr_scheduler_type : Learning rate type, see SchedulerType \noptimizer : Optimizer \nwarmup_ratio : Warmup ratios as ratio of steps (default 0.1) \nevaluate_every_n_step : Number of steps to evaluate \npooling_mode : Pooling mode for your model \nloss_func : Loss function to instantiate model \nmodel_args : Model arguments to pass \nmodel_kwargs : Model keyword arguments \npadding_strategy : Tokenization padding strategy \ntruncate : Truncate tokenization strategy \nspecial_tokens : Special tokens to add to the tokenizer \nseed : Dataset shuffle seed \nargs : \nkwargs : \n \n\nReturns \n", "signature": "(\tmodel_fp_or_name_or_cls : Union [ str , transformers . modeling_utils . PreTrainedModel ] , \ttokenizer : transformers . tokenization_utils . PreTrainedTokenizer , \toutput_dir : str , \tcompute_metric_fn , \tmetric : datasets . metric . Metric , \tdataset : datasets . dataset_dict . DatasetDict = None , \ttrain_dataset : List [ Union [ debeir . datasets . types . RelevanceExample , debeir . datasets . types . InputExample , datasets . arrow_dataset . Dataset ]] = None , \teval_dataset : List [ Union [ debeir . datasets . types . RelevanceExample , debeir . datasets . types . InputExample , datasets . arrow_dataset . Dataset ]] = None , \ttrain_batch_size = 32 , \tnum_epochs = 3 , \tlearning_rate = 5e-05 , \tlr_scheduler_type : transformers . trainer_utils . 
SchedulerType = < SchedulerType . CONSTANT_WITH_WARMUP : 'constant_with_warmup' > , \toptimizer : str = 'adamw_hf' , \twarmup_ratio = 0.1 , \tevaluate_every_n_step : int = 1000 , \tpooling_mode = None , \tloss_func = None , \tmodel_args = None , \tmodel_kwargs = None , \tpadding_strategy = 'max_length' , \ttruncate = True , \tspecial_tokens = None , \tseed = 42 , \t* args , \t** kwargs ) -> transformers . trainer . Trainer : ", "funcdef": "def"}, {"fullname": "debeir.training.utils", "modulename": "debeir.training.utils", "kind": "module", "doc": "
\n"}, {"fullname": "debeir.training.utils.LoggingScheduler", "modulename": "debeir.training.utils", "qualname": "LoggingScheduler", "kind": "class", "doc": "
\n"}, {"fullname": "debeir.training.utils.LoggingScheduler.__init__", "modulename": "debeir.training.utils", "qualname": "LoggingScheduler.__init__", "kind": "function", "doc": "
\n", "signature": "(scheduler : torch . optim . lr_scheduler . LambdaLR ) "}, {"fullname": "debeir.training.utils.LoggingScheduler.step", "modulename": "debeir.training.utils", "qualname": "LoggingScheduler.step", "kind": "function", "doc": "
\n", "signature": "(self , epoch = None ): ", "funcdef": "def"}, {"fullname": "debeir.training.utils.get_scheduler_with_wandb", "modulename": "debeir.training.utils", "qualname": "get_scheduler_with_wandb", "kind": "function", "doc": "Returns the correct learning rate scheduler. Available scheduler: constantlr, warmupconstant, warmuplinear, warmupcosine, warmupcosinewithhardrestarts
\n", "signature": "(optimizer , scheduler : str , warmup_steps : int , t_total : int ): ", "funcdef": "def"}, {"fullname": "debeir.training.utils.LoggingLoss", "modulename": "debeir.training.utils", "qualname": "LoggingLoss", "kind": "class", "doc": "
\n"}, {"fullname": "debeir.training.utils.LoggingLoss.__init__", "modulename": "debeir.training.utils", "qualname": "LoggingLoss.__init__", "kind": "function", "doc": "
\n", "signature": "(loss_fn ) "}, {"fullname": "debeir.training.utils.TokenizerOverload", "modulename": "debeir.training.utils", "qualname": "TokenizerOverload", "kind": "class", "doc": "
\n"}, {"fullname": "debeir.training.utils.TokenizerOverload.__init__", "modulename": "debeir.training.utils", "qualname": "TokenizerOverload.__init__", "kind": "function", "doc": "
\n", "signature": "(tokenizer , tokenizer_kwargs , debug = False ) "}, {"fullname": "debeir.training.utils.LoggingEvaluator", "modulename": "debeir.training.utils", "qualname": "LoggingEvaluator", "kind": "class", "doc": "
\n"}, {"fullname": "debeir.training.utils.LoggingEvaluator.__init__", "modulename": "debeir.training.utils", "qualname": "LoggingEvaluator.__init__", "kind": "function", "doc": "
\n", "signature": "(evaluator ) "}, {"fullname": "debeir.training.utils.SentDataset", "modulename": "debeir.training.utils", "qualname": "SentDataset", "kind": "class", "doc": "
\n"}, {"fullname": "debeir.training.utils.SentDataset.__init__", "modulename": "debeir.training.utils", "qualname": "SentDataset.__init__", "kind": "function", "doc": "
\n", "signature": "(\tdataset : datasets . arrow_dataset . Dataset , \ttext_cols : List [ str ] , \tlabel_col : str = None , \tlabel = None ) "}, {"fullname": "debeir.training.utils.SentDatasetList", "modulename": "debeir.training.utils", "qualname": "SentDatasetList", "kind": "class", "doc": "
\n"}, {"fullname": "debeir.training.utils.SentDatasetList.__init__", "modulename": "debeir.training.utils", "qualname": "SentDatasetList.__init__", "kind": "function", "doc": "
\n", "signature": "(datasets : List [ debeir . training . utils . SentDataset ] ) "}, {"fullname": "debeir.training.utils.tokenize_function", "modulename": "debeir.training.utils", "qualname": "tokenize_function", "kind": "function", "doc": "Tokenizer function
\n\nParameters \n\n\ntokenizer : Tokenizer \nexamples : Input examples to tokenize \npadding_strategy : Padding strategy \ntruncate : Truncate sentences \n \n\nReturns \n\n\nReturns a list of tokenized examples\n
\n \n", "signature": "(tokenizer , examples , padding_strategy , truncate ): ", "funcdef": "def"}, {"fullname": "debeir.training.utils.get_max_seq_length", "modulename": "debeir.training.utils", "qualname": "get_max_seq_length", "kind": "function", "doc": "
\n", "signature": "(tokenizer , dataset , x_labels , dataset_key = 'train' ): ", "funcdef": "def"}, {"fullname": "debeir.utils", "modulename": "debeir.utils", "kind": "module", "doc": "Common utilities such as score normalization and creating output directory w/ checks
\n"}, {"fullname": "debeir.utils.scaler", "modulename": "debeir.utils.scaler", "kind": "module", "doc": "
\n"}, {"fullname": "debeir.utils.scaler.unpack_elasticsearch_scores", "modulename": "debeir.utils.scaler", "qualname": "unpack_elasticsearch_scores", "kind": "function", "doc": "Helper function to retrieve the top score of documents for each topic.\nUsed in NIR weight adjustment calculation.
\n\nParameters \n\n\nresults : Raw input of results from Elasticsearch library \n \n\nReturns \n\n\nReturns a 1-D dictionary of {topic_num: top_score} pairs.\n
\n \n", "signature": "(results ) -> Dict : ", "funcdef": "def"}, {"fullname": "debeir.utils.scaler.get_z_value", "modulename": "debeir.utils.scaler", "qualname": "get_z_value", "kind": "function", "doc": "Analytical solution for the normalization constant, z, used in NIR log normalization.
\n\nParameters \n\n\ncosine_ceiling : The highest theoretical additive cosine score \nbm25_ceiling : The highest BM25 score retrieved from a given topic OR an estimate. \n \n\nReturns \n\n\nThe normalization parameter for NIR log normalization.\n
\n \n", "signature": "(cosine_ceiling , bm25_ceiling ) -> float : ", "funcdef": "def"}, {"fullname": "debeir.utils.utils", "modulename": "debeir.utils.utils", "kind": "module", "doc": "
\n"}, {"fullname": "debeir.utils.utils.create_output_file", "modulename": "debeir.utils.utils", "qualname": "create_output_file", "kind": "function", "doc": "Create output file based on config instructions
\n\nParameters \n\n\nconfig : The config object with output file options. \nconfig_fp : The config file path used in default naming options for the output file. \nremove : Overwrites the output file if it exists \noutput_file : The output file path if it exists \noutput_directory : The output directory used for default naming (specified in nir config) \nkwargs : Compatibility arguments \n \n\nReturns \n", "signature": "(config , config_fp , remove , output_file , output_directory , ** kwargs ): ", "funcdef": "def"}, {"fullname": "debeir.utils.utils.unpack_coroutine", "modulename": "debeir.utils.utils", "qualname": "unpack_coroutine", "kind": "function", "doc": "Recursively unwraps co-routines until a result is reached.
\n\nParameters \n\n\nf : Wrapped co-routine function. \n \n\nReturns \n\n\nResults from the (final) evaluated co-routine.\n
\n \n", "signature": "(f ): ", "funcdef": "async def"}, {"fullname": "debeir.utils.utils.flatten", "modulename": "debeir.utils.utils", "qualname": "flatten", "kind": "function", "doc": "Flattens a multidimensional dictionary (dictionary of dictionaries) to a single layer with child keys seperated by\n\"sep\"
\n\nParameters \n\n\nd : Multi-level dictionary to flatten. \nparent_key : Prepend a parent_key to all layers. \nsep : Seperator token between child and parent layers. \n \n\nReturns \n\n\nA flattened 1-D dictionary with keys seperated by *sep*.\n
\n \n", "signature": "(d , parent_key = '' , sep = '_' ): ", "funcdef": "def"}, {"fullname": "debeir.utils.utils.remove_excess_whitespace", "modulename": "debeir.utils.utils", "qualname": "remove_excess_whitespace", "kind": "function", "doc": "
\n", "signature": "(s ): ", "funcdef": "def"}];
+ /** pdoc search index */const docs = [{"fullname": "debeir", "modulename": "debeir", "kind": "module", "doc": "The DeBEIR (Dense Bi-Encoder Information Retrieval) source code library.
\n\nSee ./examples/ in the parent directory for an out-of-the-box runnable code.
\n\nOtherwise, check out notebooks in the parent directory for training your own model amongst other things.
\n"}, {"fullname": "debeir.core", "modulename": "debeir.core", "kind": "module", "doc": "Core library interfaces that must be implemented for custom datasets
\n\nInterfaces to implement custom datasets in debeir.datasets.
\n"}, {"fullname": "debeir.core.callbacks", "modulename": "debeir.core.callbacks", "kind": "module", "doc": "Callbacks for before after running.\nE.g. before is for setup\nafter is for evaluation/serialization etc
\n"}, {"fullname": "debeir.core.callbacks.Callback", "modulename": "debeir.core.callbacks", "qualname": "Callback", "kind": "class", "doc": "
\n"}, {"fullname": "debeir.core.callbacks.Callback.__init__", "modulename": "debeir.core.callbacks", "qualname": "Callback.__init__", "kind": "function", "doc": "
\n", "signature": "() "}, {"fullname": "debeir.core.callbacks.Callback.before", "modulename": "debeir.core.callbacks", "qualname": "Callback.before", "kind": "function", "doc": "
\n", "signature": "(self , pipeline : debeir . core . pipeline . Pipeline ): ", "funcdef": "def"}, {"fullname": "debeir.core.callbacks.Callback.after", "modulename": "debeir.core.callbacks", "qualname": "Callback.after", "kind": "function", "doc": "
\n", "signature": "(self , results : List ): ", "funcdef": "def"}, {"fullname": "debeir.core.callbacks.SerializationCallback", "modulename": "debeir.core.callbacks", "qualname": "SerializationCallback", "kind": "class", "doc": "
\n", "bases": "Callback"}, {"fullname": "debeir.core.callbacks.SerializationCallback.__init__", "modulename": "debeir.core.callbacks", "qualname": "SerializationCallback.__init__", "kind": "function", "doc": "
\n", "signature": "(\tconfig : debeir . core . config . GenericConfig , \tnir_config : debeir . core . config . NIRConfig ) "}, {"fullname": "debeir.core.callbacks.SerializationCallback.before", "modulename": "debeir.core.callbacks", "qualname": "SerializationCallback.before", "kind": "function", "doc": "Check if output file exists
\n\nReturns \n\n\nOutput file path\n
\n \n", "signature": "(self , pipeline : debeir . core . pipeline . Pipeline ): ", "funcdef": "def"}, {"fullname": "debeir.core.callbacks.SerializationCallback.after", "modulename": "debeir.core.callbacks", "qualname": "SerializationCallback.after", "kind": "function", "doc": "Serialize results to self.output_file in a TREC-style format
\n\nParameters \n\n\ntopic_num : Topic number to serialize \nres : Raw elasticsearch result \nrun_name: The run name for TREC-style runs (default : NO_RUN_NAME) \n \n", "signature": "(self , results : List ): ", "funcdef": "def"}, {"fullname": "debeir.core.callbacks.EvaluationCallback", "modulename": "debeir.core.callbacks", "qualname": "EvaluationCallback", "kind": "class", "doc": "
\n", "bases": "Callback"}, {"fullname": "debeir.core.callbacks.EvaluationCallback.__init__", "modulename": "debeir.core.callbacks", "qualname": "EvaluationCallback.__init__", "kind": "function", "doc": "
\n", "signature": "(evaluator : debeir . evaluation . evaluator . Evaluator , config ) "}, {"fullname": "debeir.core.callbacks.EvaluationCallback.before", "modulename": "debeir.core.callbacks", "qualname": "EvaluationCallback.before", "kind": "function", "doc": "
\n", "signature": "(self , pipeline : debeir . core . pipeline . Pipeline ): ", "funcdef": "def"}, {"fullname": "debeir.core.callbacks.EvaluationCallback.after", "modulename": "debeir.core.callbacks", "qualname": "EvaluationCallback.after", "kind": "function", "doc": "
\n", "signature": "(self , results : List , id_field = 'id' ): ", "funcdef": "def"}, {"fullname": "debeir.core.config", "modulename": "debeir.core.config", "kind": "module", "doc": "
\n"}, {"fullname": "debeir.core.config.Config", "modulename": "debeir.core.config", "qualname": "Config", "kind": "class", "doc": "Config Interface with creation class methods
\n"}, {"fullname": "debeir.core.config.Config.__init__", "modulename": "debeir.core.config", "qualname": "Config.__init__", "kind": "function", "doc": "
\n", "signature": "() "}, {"fullname": "debeir.core.config.Config.from_toml", "modulename": "debeir.core.config", "qualname": "Config.from_toml", "kind": "function", "doc": "Instantiates a Config object from a toml file
\n\nParameters \n\n\nfp : File path of the Config TOML file \nfield_class : Class of the Config object to be instantiated \nargs : Arguments to be passed to Config \nkwargs : Keyword arguments to be passed \n \n\nReturns \n\n\nA instantiated and validated Config object.\n
\n \n", "signature": "(\tcls , \tfp : Union [ str , pathlib . Path ] , \tfield_class , \t* args , \t** kwargs ) -> debeir . core . config . Config : ", "funcdef": "def"}, {"fullname": "debeir.core.config.Config.from_args", "modulename": "debeir.core.config", "qualname": "Config.from_args", "kind": "function", "doc": "Instantiates a Config object from arguments
\n\nParameters \n\n\nargs_dict : \nfield_class : \nargs : \nkwargs : \n \n\nReturns \n", "signature": "(cls , args_dict : MutableMapping , field_class , * args , ** kwargs ): ", "funcdef": "def"}, {"fullname": "debeir.core.config.Config.from_dict", "modulename": "debeir.core.config", "qualname": "Config.from_dict", "kind": "function", "doc": "Instantiates a Config object from a dictionary
\n\nParameters \n\n\ndata_class : \nkwargs : \n \n\nReturns \n", "signature": "(cls , data_class , ** kwargs ): ", "funcdef": "def"}, {"fullname": "debeir.core.config.Config.validate", "modulename": "debeir.core.config", "qualname": "Config.validate", "kind": "function", "doc": "Validates if the config is correct.\nMust be implemented by inherited classes.
\n", "signature": "(self ): ", "funcdef": "def"}, {"fullname": "debeir.core.config.GenericConfig", "modulename": "debeir.core.config", "qualname": "GenericConfig", "kind": "class", "doc": "Generic NIR Configuration file for which all configs will inherit
\n", "bases": "Config, abc.ABC"}, {"fullname": "debeir.core.config.GenericConfig.__init__", "modulename": "debeir.core.config", "qualname": "GenericConfig.__init__", "kind": "function", "doc": "
\n", "signature": "(\tquery_type : str , \tindex : str = None , \tencoder_normalize : bool = True , \tablations : bool = False , \tnorm_weight : float = None , \tautomatic : bool = None , \tencoder : object = None , \tencoder_fp : str = None , \tquery_weights : List [ float ] = None , \tcosine_weights : List [ float ] = None , \tevaluate : bool = False , \tqrels : str = None , \tconfig_fn : str = None , \tquery_fn : str = None , \tparser_fn : str = None , \texecutor_fn : str = None , \tcosine_ceiling : float = None , \ttopics_path : str = None , \treturn_id_only : bool = False , \toverwrite_output_if_exists : bool = False , \toutput_file : str = None , \trun_name : str = None ) "}, {"fullname": "debeir.core.config.GenericConfig.from_toml", "modulename": "debeir.core.config", "qualname": "GenericConfig.from_toml", "kind": "function", "doc": "Instantiates a Config object from a toml file
\n\nParameters \n\n\nfp : File path of the Config TOML file \nfield_class : Class of the Config object to be instantiated \nargs : Arguments to be passed to Config \nkwargs : Keyword arguments to be passed \n \n\nReturns \n\n\nA instantiated and validated Config object.\n
\n \n", "signature": "(\tcls , \tfp : Union [ str , pathlib . Path ] , \t* args , \t** kwargs ) -> debeir . core . config . GenericConfig : ", "funcdef": "def"}, {"fullname": "debeir.core.config.ElasticsearchConfig", "modulename": "debeir.core.config", "qualname": "ElasticsearchConfig", "kind": "class", "doc": "Basic Elasticsearch configuration file settings from the master nir.toml file
\n", "bases": "Config"}, {"fullname": "debeir.core.config.ElasticsearchConfig.__init__", "modulename": "debeir.core.config", "qualname": "ElasticsearchConfig.__init__", "kind": "function", "doc": "
\n", "signature": "(protocol : str , ip : str , port : str , timeout : int ) "}, {"fullname": "debeir.core.config.ElasticsearchConfig.validate", "modulename": "debeir.core.config", "qualname": "ElasticsearchConfig.validate", "kind": "function", "doc": "Checks if Elasticsearch URL is correct
\n", "signature": "(self ): ", "funcdef": "def"}, {"fullname": "debeir.core.config.SolrConfig", "modulename": "debeir.core.config", "qualname": "SolrConfig", "kind": "class", "doc": "Basic Solr configuration file settings from the master nir.toml file
\n", "bases": "ElasticsearchConfig"}, {"fullname": "debeir.core.config.SolrConfig.__init__", "modulename": "debeir.core.config", "qualname": "SolrConfig.__init__", "kind": "function", "doc": "
\n", "signature": "(protocol : str , ip : str , port : str , timeout : int ) "}, {"fullname": "debeir.core.config.MetricsConfig", "modulename": "debeir.core.config", "qualname": "MetricsConfig", "kind": "class", "doc": "Basic Metrics configuration file settings from the master nir.toml file
\n", "bases": "Config"}, {"fullname": "debeir.core.config.MetricsConfig.__init__", "modulename": "debeir.core.config", "qualname": "MetricsConfig.__init__", "kind": "function", "doc": "
\n", "signature": "(metrics : List [ str ] ) "}, {"fullname": "debeir.core.config.MetricsConfig.validate", "modulename": "debeir.core.config", "qualname": "MetricsConfig.validate", "kind": "function", "doc": "Checks if each Metrics is usable by evaluator classes
\n", "signature": "(self ): ", "funcdef": "def"}, {"fullname": "debeir.core.config.NIRConfig", "modulename": "debeir.core.config", "qualname": "NIRConfig", "kind": "class", "doc": "Basic NIR configuration file settings from the master nir.toml file
\n", "bases": "Config"}, {"fullname": "debeir.core.config.NIRConfig.__init__", "modulename": "debeir.core.config", "qualname": "NIRConfig.__init__", "kind": "function", "doc": "
\n", "signature": "(\tnorm_weight : str , \tevaluate : bool , \treturn_size : int , \toutput_directory : str ) "}, {"fullname": "debeir.core.config.NIRConfig.validate", "modulename": "debeir.core.config", "qualname": "NIRConfig.validate", "kind": "function", "doc": "Validates if the config is correct.\nMust be implemented by inherited classes.
\n", "signature": "(self ): ", "funcdef": "def"}, {"fullname": "debeir.core.config.apply_config", "modulename": "debeir.core.config", "qualname": "apply_config", "kind": "function", "doc": "Configuration decorator.
\n\nParameters \n\n\nfunc : Decorated function \n \n\nReturns \n", "signature": "(func ): ", "funcdef": "def"}, {"fullname": "debeir.core.config.override_with_toml_config", "modulename": "debeir.core.config", "qualname": "override_with_toml_config", "kind": "function", "doc": "Configuration decorator. Overwrite a functions kwargs and args with a specified toml config file.\nPass override_with_config=path/to/config
\n\nParameters \n\n\nfunc : Decorated function \n \n\nReturns \n", "signature": "(func ): ", "funcdef": "def"}, {"fullname": "debeir.core.config.save_kwargs_to_file", "modulename": "debeir.core.config", "qualname": "save_kwargs_to_file", "kind": "function", "doc": "
\n", "signature": "(func ): ", "funcdef": "def"}, {"fullname": "debeir.core.converters", "modulename": "debeir.core.converters", "kind": "module", "doc": "
\n"}, {"fullname": "debeir.core.converters.ParsedTopicsToDataset", "modulename": "debeir.core.converters", "qualname": "ParsedTopicsToDataset", "kind": "class", "doc": "Converts a parser's output to a huggingface dataset object.
\n"}, {"fullname": "debeir.core.converters.ParsedTopicsToDataset.__init__", "modulename": "debeir.core.converters", "qualname": "ParsedTopicsToDataset.__init__", "kind": "function", "doc": "
\n", "signature": "() "}, {"fullname": "debeir.core.converters.ParsedTopicsToDataset.convert", "modulename": "debeir.core.converters", "qualname": "ParsedTopicsToDataset.convert", "kind": "function", "doc": "Flatten a Dict of shape (traditional parser output)\n{topic_id: {\n \"Facet_1\": ...\n \"Facet_2\": ...\n }\n}
\n\n->
\n\nTo a flattened arrow-like dataset.\n{\ntopic_ids: [],\nFacet_1s: [],\nFacet_2s: [],\n}
\n\nParameters \n\n\noutput : Topics output from the parser object \n \n\nReturns \n", "signature": "(\tcls , \tparser : debeir . core . parser . Parser , \toutput : Dict [ Union [ str , int ], Dict ] ): ", "funcdef": "def"}, {"fullname": "debeir.core.document", "modulename": "debeir.core.document", "kind": "module", "doc": "
\n"}, {"fullname": "debeir.core.document.Document", "modulename": "debeir.core.document", "qualname": "Document", "kind": "class", "doc": "Generic Document class.\nUsed as an interface for interacting across multiple indexes with different mappings.
\n"}, {"fullname": "debeir.core.document.Document.__init__", "modulename": "debeir.core.document", "qualname": "Document.__init__", "kind": "function", "doc": "
\n", "signature": "(\tdoc_id : Union [ int , float , str ] , \ttopic_num : Union [ int , str , float ] = None , \tfacets : Dict = None , \tscore : Union [ float , int ] = 0.0 , \tscores : Dict [ str , Union [ float , int ]] = < factory > ) "}, {"fullname": "debeir.core.document.Document.from_results", "modulename": "debeir.core.document", "qualname": "Document.from_results", "kind": "function", "doc": "Produces a list of Document objects from raw results returned from the index
\n\nIn the format {topic_num: [Document, ..., Document]}
\n", "signature": "(\tcls , \tresults , \t* args , \t** kwargs ) -> Dict [ Union [ int , float ], debeir . core . document . Document ] : ", "funcdef": "def"}, {"fullname": "debeir.core.document.Document.get_document_id", "modulename": "debeir.core.document", "qualname": "Document.get_document_id", "kind": "function", "doc": "Returns \n\n\nself.doc_id\n
\n \n", "signature": "(self ): ", "funcdef": "def"}, {"fullname": "debeir.core.document.Document.flatten_facets", "modulename": "debeir.core.document", "qualname": "Document.flatten_facets", "kind": "function", "doc": "Flattens multi-level internal document facets into a single level\n e.g. Doc['Upper']['Lower'] -> Doc['Upper_Lower']
\n\nParameters \n\n\n", "signature": "(self , * args , ** kwargs ): ", "funcdef": "def"}, {"fullname": "debeir.core.document.Document.get_document_facet", "modulename": "debeir.core.document", "qualname": "Document.get_document_facet", "kind": "function", "doc": "Retrieve a document facet\nWorks for multidimensional keys or single
\n\nParameters \n\n\nkey : Facet to retrieve \nsep : The seperator for multidimensional key \n \n\nReturns \n\n\nReturns the document facet given the key (field)\n
\n \n", "signature": "(self , key , sep = '_' ): ", "funcdef": "def"}, {"fullname": "debeir.core.document.Document.set", "modulename": "debeir.core.document", "qualname": "Document.set", "kind": "function", "doc": "Set attributes of the object. Use keyword arguments to do so. Works as a builder class.\ndoc.set(doc_id=\"123\").set(facets={\"title\": \"my title\"})
\n\nParameters \n\n\ndoc_id : \nfacets : \nscore : \nfacet : \nfacet_value : \n \n\nReturns \n\n\nReturns document object\n
\n \n", "signature": "(\tself , \tdoc_id = None , \tfacets = None , \tscore = None , \tfacet = None , \tfacet_value = None ) -> debeir . core . document . Document : ", "funcdef": "def"}, {"fullname": "debeir.core.document.Document.to_trec_format", "modulename": "debeir.core.document", "qualname": "Document.to_trec_format", "kind": "function", "doc": "Returns TREC format for the document
\n\nReturns \n\n\nA trec formatted string\n
\n \n", "signature": "(self , rank , run_name ) -> str : ", "funcdef": "def"}, {"fullname": "debeir.core.document.Document.get_trec_format", "modulename": "debeir.core.document", "qualname": "Document.get_trec_format", "kind": "function", "doc": "Get the trec format of a list of ranked documents. This function is a generator.
\n\nParameters \n\n\nranked_list : A list of Document-type objects \nrun_name : Run name to print in the TREC formatted string \nsort : Whether to sort the input list in descending order of score. \nsorting_func : Custom sorting function will be used if provided \n \n", "signature": "(\tcls , \tranked_list : List [ debeir . core . document . Document ] , \trun_name = 'NO_RUN_NAME' , \tsort = True , \tsorting_func = None ): ", "funcdef": "def"}, {"fullname": "debeir.core.document.ElasticsearchDocument", "modulename": "debeir.core.document", "qualname": "ElasticsearchDocument", "kind": "class", "doc": "Generic Document class.\nUsed as an interface for interacting across multiple indexes with different mappings.
\n", "bases": "Document"}, {"fullname": "debeir.core.document.ElasticsearchDocument.from_results", "modulename": "debeir.core.document", "qualname": "ElasticsearchDocument.from_results", "kind": "function", "doc": "Produces a list of Document objects from raw results returned from the index
\n\nIn the format {topic_num: [Document, ..., Document]}
\n", "signature": "(\tcls , \tresults , \tquery_cls , \tignore_facets = True , \t* args , \t** kwargs ) -> Dict [ Union [ int , float ], debeir . core . document . Document ] : ", "funcdef": "def"}, {"fullname": "debeir.core.executor", "modulename": "debeir.core.executor", "kind": "module", "doc": "
\n"}, {"fullname": "debeir.core.executor.GenericElasticsearchExecutor", "modulename": "debeir.core.executor", "qualname": "GenericElasticsearchExecutor", "kind": "class", "doc": "Generic Executor class for Elasticsearch
\n", "bases": "debeir.engines.elasticsearch.executor.ElasticsearchExecutor"}, {"fullname": "debeir.core.executor.GenericElasticsearchExecutor.__init__", "modulename": "debeir.core.executor", "qualname": "GenericElasticsearchExecutor.__init__", "kind": "function", "doc": "
\n", "signature": "(\ttopics : Dict [ Union [ str , int ], Dict [ str , str ]] , \tclient : elasticsearch . AsyncElasticsearch , \tindex_name : str , \toutput_file : str , \tquery : debeir . core . query . GenericElasticsearchQuery , \tencoder : Optional [ debeir . rankers . transformer_sent_encoder . Encoder ] = None , \tconfig = None , \t* args , \t** kwargs ) "}, {"fullname": "debeir.core.executor.GenericElasticsearchExecutor.generate_query", "modulename": "debeir.core.executor", "qualname": "GenericElasticsearchExecutor.generate_query", "kind": "function", "doc": "Generates a standard BM25 query given the topic number
\n\nParameters \n\n\ntopic_num : Query topic number to generate \nbest_fields : Whether to use a curated list of fields \nkwargs : \n \n\nReturns \n", "signature": "(self , topic_num , best_fields = True , ** kwargs ): ", "funcdef": "def"}, {"fullname": "debeir.core.executor.GenericElasticsearchExecutor.generate_embedding_query", "modulename": "debeir.core.executor", "qualname": "GenericElasticsearchExecutor.generate_embedding_query", "kind": "function", "doc": "Executes an NIR-style query with combined scoring.
\n\nParameters \n\n\ntopic_num : \ncosine_weights : \nquery_weights : \nnorm_weight : \nautomatic_scores : \nkwargs : \n \n\nReturns \n", "signature": "(\tself , \ttopic_num , \tcosine_weights = None , \tquery_weights = None , \tnorm_weight = 2.15 , \tautomatic_scores = None , \t** kwargs ): ", "funcdef": "def"}, {"fullname": "debeir.core.executor.GenericElasticsearchExecutor.execute_query", "modulename": "debeir.core.executor", "qualname": "GenericElasticsearchExecutor.execute_query", "kind": "function", "doc": "Execute a query given parameters
\n\nParameters \n\n\n", "signature": "(\tself , \tquery = None , \treturn_size : int = None , \treturn_id_only : bool = None , \ttopic_num = None , \tablation = False , \tquery_type = None , \t** kwargs ): ", "funcdef": "async def"}, {"fullname": "debeir.core.executor.GenericElasticsearchExecutor.run_automatic_adjustment", "modulename": "debeir.core.executor", "qualname": "GenericElasticsearchExecutor.run_automatic_adjustment", "kind": "function", "doc": "Get the normalization constant to be used in NIR-style queries for all topics given an initial\nrun of BM25 results.
\n", "signature": "(self , return_results = False ): ", "funcdef": "async def"}, {"fullname": "debeir.core.executor.GenericElasticsearchExecutor.build_from_config", "modulename": "debeir.core.executor", "qualname": "GenericElasticsearchExecutor.build_from_config", "kind": "function", "doc": "Build an query executor engine from a config file.
\n", "signature": "(\tcls , \ttopics : Dict , \tquery_obj : debeir . core . query . GenericElasticsearchQuery , \tclient , \tconfig : debeir . core . config . GenericConfig , \tnir_config : debeir . core . config . NIRConfig ): ", "funcdef": "def"}, {"fullname": "debeir.core.indexer", "modulename": "debeir.core.indexer", "kind": "module", "doc": "
\n"}, {"fullname": "debeir.core.indexer.Indexer", "modulename": "debeir.core.indexer", "qualname": "Indexer", "kind": "class", "doc": "
\n"}, {"fullname": "debeir.core.indexer.Indexer.__init__", "modulename": "debeir.core.indexer", "qualname": "Indexer.__init__", "kind": "function", "doc": "
\n", "signature": "(client ) "}, {"fullname": "debeir.core.indexer.Indexer.get_field", "modulename": "debeir.core.indexer", "qualname": "Indexer.get_field", "kind": "function", "doc": "
\n", "signature": "(self , document , field ): ", "funcdef": "def"}, {"fullname": "debeir.core.indexer.SemanticElasticsearchIndexer", "modulename": "debeir.core.indexer", "qualname": "SemanticElasticsearchIndexer", "kind": "class", "doc": "Create a NIR-style index, with dense field representations with provided sentence encoder\nAssumes you've already indexed to start with.
\n", "bases": "Indexer, threading.Thread"}, {"fullname": "debeir.core.indexer.SemanticElasticsearchIndexer.__init__", "modulename": "debeir.core.indexer", "qualname": "SemanticElasticsearchIndexer.__init__", "kind": "function", "doc": "This constructor should always be called with keyword arguments. Arguments are:
\n\ngroup should be None; reserved for future extension when a ThreadGroup\nclass is implemented.
\n\ntarget is the callable object to be invoked by the run()\nmethod. Defaults to None, meaning nothing is called.
\n\nname is the thread name. By default, a unique name is constructed of\nthe form \"Thread-N\" where N is a small decimal number.
\n\nargs is the argument tuple for the target invocation. Defaults to ().
\n\nkwargs is a dictionary of keyword arguments for the target\ninvocation. Defaults to {}.
\n\nIf a subclass overrides the constructor, it must make sure to invoke\nthe base class constructor (Thread.__init__()) before doing anything\nelse to the thread.
\n", "signature": "(\tes_client : elasticsearch . Elasticsearch , \tencoder : debeir . rankers . transformer_sent_encoder . Encoder , \tindex : str , \tfields_to_encode : List [ str ] , \tqueue : queue . Queue ) "}, {"fullname": "debeir.core.indexer.SemanticElasticsearchIndexer.update_mappings", "modulename": "debeir.core.indexer", "qualname": "SemanticElasticsearchIndexer.update_mappings", "kind": "function", "doc": "
\n", "signature": "(self , index , fields , client : elasticsearch . Elasticsearch ): ", "funcdef": "def"}, {"fullname": "debeir.core.indexer.SemanticElasticsearchIndexer.get_field", "modulename": "debeir.core.indexer", "qualname": "SemanticElasticsearchIndexer.get_field", "kind": "function", "doc": "
\n", "signature": "(self , document , field ): ", "funcdef": "def"}, {"fullname": "debeir.core.indexer.SemanticElasticsearchIndexer.index_document", "modulename": "debeir.core.indexer", "qualname": "SemanticElasticsearchIndexer.index_document", "kind": "function", "doc": "
\n", "signature": "(self , document ): ", "funcdef": "def"}, {"fullname": "debeir.core.indexer.SemanticElasticsearchIndexer.run", "modulename": "debeir.core.indexer", "qualname": "SemanticElasticsearchIndexer.run", "kind": "function", "doc": "Method representing the thread's activity.
\n\nYou may override this method in a subclass. The standard run() method\ninvokes the callable object passed to the object's constructor as the\ntarget argument, if any, with sequential and keyword arguments taken\nfrom the args and kwargs arguments, respectively.
\n", "signature": "(self ): ", "funcdef": "def"}, {"fullname": "debeir.core.parser", "modulename": "debeir.core.parser", "kind": "module", "doc": "
\n"}, {"fullname": "debeir.core.parser.Parser", "modulename": "debeir.core.parser", "qualname": "Parser", "kind": "class", "doc": "Parser interface
\n"}, {"fullname": "debeir.core.parser.Parser.__init__", "modulename": "debeir.core.parser", "qualname": "Parser.__init__", "kind": "function", "doc": "
\n", "signature": "(id_field : object , parse_fields : List [ str ] ) "}, {"fullname": "debeir.core.parser.Parser.normalize", "modulename": "debeir.core.parser", "qualname": "Parser.normalize", "kind": "function", "doc": "Flatten the dictionary, i.e. from Dict[int, Dict] -> Dict[str, str_or_int]
\n\nParameters \n\n\n\nReturns \n", "signature": "(cls , input_dict ) -> Dict : ", "funcdef": "def"}, {"fullname": "debeir.core.parser.Parser.get_topics", "modulename": "debeir.core.parser", "qualname": "Parser.get_topics", "kind": "function", "doc": "Instance method for getting topics, forwards instance self parameters to the _get_topics class method.
\n", "signature": "(self , path , * args , ** kwargs ): ", "funcdef": "def"}, {"fullname": "debeir.core.parser.PickleParser", "modulename": "debeir.core.parser", "qualname": "PickleParser", "kind": "class", "doc": "Load topics from a pickle file
\n", "bases": "Parser"}, {"fullname": "debeir.core.parser.PickleParser.__init__", "modulename": "debeir.core.parser", "qualname": "PickleParser.__init__", "kind": "function", "doc": "
\n", "signature": "(id_field : object , parse_fields : List [ str ] ) "}, {"fullname": "debeir.core.parser.XMLParser", "modulename": "debeir.core.parser", "qualname": "XMLParser", "kind": "class", "doc": "Load topics from an XML file
\n", "bases": "Parser"}, {"fullname": "debeir.core.parser.XMLParser.__init__", "modulename": "debeir.core.parser", "qualname": "XMLParser.__init__", "kind": "function", "doc": "
\n", "signature": "(id_field : str , parse_fields : List [ str ] , topic_field_name : str ) "}, {"fullname": "debeir.core.parser.XMLParser.unwrap", "modulename": "debeir.core.parser", "qualname": "XMLParser.unwrap", "kind": "function", "doc": "Converts defaultdict to dict and list of size 1 to just the element
\n\nParameters \n\n\n", "signature": "(cls , doc_dict , key ): ", "funcdef": "def"}, {"fullname": "debeir.core.parser.CSVParser", "modulename": "debeir.core.parser", "qualname": "CSVParser", "kind": "class", "doc": "Loads topics from a CSV file
\n", "bases": "Parser"}, {"fullname": "debeir.core.parser.CSVParser.__init__", "modulename": "debeir.core.parser", "qualname": "CSVParser.__init__", "kind": "function", "doc": "
\n", "signature": "(id_field = None , parse_fields = None ) "}, {"fullname": "debeir.core.parser.TSVParser", "modulename": "debeir.core.parser", "qualname": "TSVParser", "kind": "class", "doc": "
\n", "bases": "CSVParser"}, {"fullname": "debeir.core.parser.TSVParser.__init__", "modulename": "debeir.core.parser", "qualname": "TSVParser.__init__", "kind": "function", "doc": "
\n", "signature": "(id_field : object , parse_fields : List [ str ] ) "}, {"fullname": "debeir.core.parser.JsonLinesParser", "modulename": "debeir.core.parser", "qualname": "JsonLinesParser", "kind": "class", "doc": "Loads topics from a jsonl file,\na JSON per line
\n\nProvide parse_fields, id_field and whether to ignore full matches on json keys\nsecondary_id appends to the primary id as jsonlines are flattened structure and may contain duplicate ids.
\n", "bases": "Parser"}, {"fullname": "debeir.core.parser.JsonLinesParser.__init__", "modulename": "debeir.core.parser", "qualname": "JsonLinesParser.__init__", "kind": "function", "doc": "
\n", "signature": "(\tid_field : str , \tparse_fields : List [ str ] , \tignore_full_match : bool = True , \tsecondary_id : str = None ) "}, {"fullname": "debeir.core.pipeline", "modulename": "debeir.core.pipeline", "kind": "module", "doc": "
\n"}, {"fullname": "debeir.core.pipeline.Pipeline", "modulename": "debeir.core.pipeline", "qualname": "Pipeline", "kind": "class", "doc": "
\n"}, {"fullname": "debeir.core.pipeline.Pipeline.__init__", "modulename": "debeir.core.pipeline", "qualname": "Pipeline.__init__", "kind": "function", "doc": "
\n", "signature": "(\tengine : debeir . core . executor . GenericElasticsearchExecutor , \tengine_name : str , \tmetrics_config , \tengine_config , \tnir_config , \trun_config : debeir . core . config . Config , \tcallbacks = None ) "}, {"fullname": "debeir.core.pipeline.Pipeline.disable", "modulename": "debeir.core.pipeline", "qualname": "Pipeline.disable", "kind": "function", "doc": "
\n", "signature": "(self , parts : list ): ", "funcdef": "def"}, {"fullname": "debeir.core.pipeline.Pipeline.build_from_config", "modulename": "debeir.core.pipeline", "qualname": "Pipeline.build_from_config", "kind": "function", "doc": "
\n", "signature": "(cls , nir_config_fp , engine , config_fp ) -> debeir . core . pipeline . Pipeline : ", "funcdef": "def"}, {"fullname": "debeir.core.pipeline.Pipeline.run_pipeline", "modulename": "debeir.core.pipeline", "qualname": "Pipeline.run_pipeline", "kind": "function", "doc": "
\n", "signature": "(self , * args , ** kwargs ): ", "funcdef": "async def"}, {"fullname": "debeir.core.pipeline.NIRPipeline", "modulename": "debeir.core.pipeline", "qualname": "NIRPipeline", "kind": "class", "doc": "
\n", "bases": "Pipeline"}, {"fullname": "debeir.core.pipeline.NIRPipeline.__init__", "modulename": "debeir.core.pipeline", "qualname": "NIRPipeline.__init__", "kind": "function", "doc": "
\n", "signature": "(* args , ** kwargs ) "}, {"fullname": "debeir.core.pipeline.NIRPipeline.prehook", "modulename": "debeir.core.pipeline", "qualname": "NIRPipeline.prehook", "kind": "function", "doc": "
\n", "signature": "(self ): ", "funcdef": "async def"}, {"fullname": "debeir.core.pipeline.NIRPipeline.run_engine", "modulename": "debeir.core.pipeline", "qualname": "NIRPipeline.run_engine", "kind": "function", "doc": "
\n", "signature": "(self , * args , ** kwargs ): ", "funcdef": "async def"}, {"fullname": "debeir.core.pipeline.NIRPipeline.posthook", "modulename": "debeir.core.pipeline", "qualname": "NIRPipeline.posthook", "kind": "function", "doc": "
\n", "signature": "(self , * args , ** kwargs ): ", "funcdef": "async def"}, {"fullname": "debeir.core.pipeline.NIRPipeline.run_pipeline", "modulename": "debeir.core.pipeline", "qualname": "NIRPipeline.run_pipeline", "kind": "function", "doc": "
\n", "signature": "(self , * args , return_results = False , ** kwargs ): ", "funcdef": "async def"}, {"fullname": "debeir.core.pipeline.NIRPipeline.register_callback", "modulename": "debeir.core.pipeline", "qualname": "NIRPipeline.register_callback", "kind": "function", "doc": "
\n", "signature": "(self , cb ): ", "funcdef": "def"}, {"fullname": "debeir.core.pipeline.BM25Pipeline", "modulename": "debeir.core.pipeline", "qualname": "BM25Pipeline", "kind": "class", "doc": "
\n", "bases": "NIRPipeline"}, {"fullname": "debeir.core.pipeline.BM25Pipeline.run_pipeline", "modulename": "debeir.core.pipeline", "qualname": "BM25Pipeline.run_pipeline", "kind": "function", "doc": "
\n", "signature": "(self , * args , return_results = False , ** kwargs ): ", "funcdef": "async def"}, {"fullname": "debeir.core.query", "modulename": "debeir.core.query", "kind": "module", "doc": "
\n"}, {"fullname": "debeir.core.query.Query", "modulename": "debeir.core.query", "qualname": "Query", "kind": "class", "doc": "A query interface class
\n\nParameters \n\n\ntopics : Topics that the query will be composed of \nconfig : Config object that contains the settings for querying \n \n"}, {"fullname": "debeir.core.query.Query.__init__", "modulename": "debeir.core.query", "qualname": "Query.__init__", "kind": "function", "doc": "
\n", "signature": "(\ttopics : Dict [ int , Dict [ str , str ]] , \tconfig : debeir . core . config . GenericConfig ) "}, {"fullname": "debeir.core.query.GenericElasticsearchQuery", "modulename": "debeir.core.query", "qualname": "GenericElasticsearchQuery", "kind": "class", "doc": "A generic elasticsearch query. Contains methods for NIR-style (embedding) queries and normal BM25 queries.\nRequires topics, configs to be included
\n", "bases": "Query"}, {"fullname": "debeir.core.query.GenericElasticsearchQuery.__init__", "modulename": "debeir.core.query", "qualname": "GenericElasticsearchQuery.__init__", "kind": "function", "doc": "
\n", "signature": "(\ttopics , \tconfig , \ttop_bm25_scores = None , \tmappings = None , \tid_mapping = None , \t* args , \t** kwargs ) "}, {"fullname": "debeir.core.query.GenericElasticsearchQuery.generate_query", "modulename": "debeir.core.query", "qualname": "GenericElasticsearchQuery.generate_query", "kind": "function", "doc": "Generates a simple BM25 query based off the query facets. Searches over all the document facets.
\n\nParameters \n\n\ntopic_num : \nargs : \nkwargs : \n \n\nReturns \n", "signature": "(self , topic_num , * args , ** kwargs ): ", "funcdef": "def"}, {"fullname": "debeir.core.query.GenericElasticsearchQuery.set_bm25_scores", "modulename": "debeir.core.query", "qualname": "GenericElasticsearchQuery.set_bm25_scores", "kind": "function", "doc": "Sets BM25 scores that are used for NIR-style scoring. The top BM25 score for each topic is used\nfor log normalization.
\n\nScore = log(bm25)/log(z) + embed_score
\n\nParameters \n\n\nscores: Top BM25 Scores of the form {topic_num : top_bm25_score} \n \n", "signature": "(self , scores : Dict [ Union [ str , int ], Union [ int , float ]] ): ", "funcdef": "def"}, {"fullname": "debeir.core.query.GenericElasticsearchQuery.has_bm25_scores", "modulename": "debeir.core.query", "qualname": "GenericElasticsearchQuery.has_bm25_scores", "kind": "function", "doc": "Checks if BM25 scores have been set
\n\nReturns \n", "signature": "(self ): ", "funcdef": "def"}, {"fullname": "debeir.core.query.GenericElasticsearchQuery.generate_query_embedding", "modulename": "debeir.core.query", "qualname": "GenericElasticsearchQuery.generate_query_embedding", "kind": "function", "doc": "Generates an embedding script score query for Elasticsearch as part of the NIR scoring function.
\n\nParameters \n\n\ntopic_num : The topic number to search for \nencoder : The encoder that will be used for encoding the topics \nnorm_weight : The BM25 log normalization constant \nablations : Whether to execute ablation style queries (i.e. one query facet\nor one document facet at a time) \ncosine_ceiling : Cosine ceiling used for automatic z-log normalization parameter calculation \nargs : \nkwargs : Pass disable_cache to disable encoder caching \n \n\nReturns \n\n\nAn elasticsearch script_score query\n
\n \n", "signature": "(self , * args , ** kwargs ): ", "funcdef": "def"}, {"fullname": "debeir.core.query.GenericElasticsearchQuery.get_id_mapping", "modulename": "debeir.core.query", "qualname": "GenericElasticsearchQuery.get_id_mapping", "kind": "function", "doc": "Get the document ID
\n\nParameters \n\n\nhit : The raw document result \n \n\nReturns \n\n\nThe document's ID\n
\n \n", "signature": "(cls , hit ): ", "funcdef": "def"}, {"fullname": "debeir.core.results", "modulename": "debeir.core.results", "kind": "module", "doc": "
\n"}, {"fullname": "debeir.core.results.Results", "modulename": "debeir.core.results", "qualname": "Results", "kind": "class", "doc": "
\n"}, {"fullname": "debeir.core.results.Results.__init__", "modulename": "debeir.core.results", "qualname": "Results.__init__", "kind": "function", "doc": "
\n", "signature": "(results : List , query_cls , engine_name ) "}, {"fullname": "debeir.core.results.Results.get_topic_ids", "modulename": "debeir.core.results", "qualname": "Results.get_topic_ids", "kind": "function", "doc": "
\n", "signature": "(self ): ", "funcdef": "def"}, {"fullname": "debeir.datasets", "modulename": "debeir.datasets", "kind": "module", "doc": "Contains data_sets implemented from nir.core
\n\n\nParser (For reading data from files into a Dict object) \nQuery object (Generating queries)\n\nThese query objects can be very lightweight containing only the mappings of the index. \n \n \n"}, {"fullname": "debeir.datasets.bioreddit", "modulename": "debeir.datasets.bioreddit", "kind": "module", "doc": "
\n"}, {"fullname": "debeir.datasets.bioreddit.BioRedditSubmissionParser", "modulename": "debeir.datasets.bioreddit", "qualname": "BioRedditSubmissionParser", "kind": "class", "doc": "Parser for the BioReddit Submission Dataset
\n", "bases": "debeir.core.parser.CSVParser"}, {"fullname": "debeir.datasets.bioreddit.BioRedditSubmissionParser.get_topics", "modulename": "debeir.datasets.bioreddit", "qualname": "BioRedditSubmissionParser.get_topics", "kind": "function", "doc": "Instance method for getting topics, forwards instance self parameters to the _get_topics class method.
\n", "signature": "(cls , csvfile ) -> Dict [ int , Dict [ str , str ]] : ", "funcdef": "def"}, {"fullname": "debeir.datasets.bioreddit.BioRedditCommentParser", "modulename": "debeir.datasets.bioreddit", "qualname": "BioRedditCommentParser", "kind": "class", "doc": "Parser for the BioReddit Comment Dataset
\n", "bases": "debeir.core.parser.CSVParser"}, {"fullname": "debeir.datasets.bioreddit.BioRedditCommentParser.get_topics", "modulename": "debeir.datasets.bioreddit", "qualname": "BioRedditCommentParser.get_topics", "kind": "function", "doc": "Instance method for getting topics, forwards instance self parameters to the _get_topics class method.
\n", "signature": "(cls , csvfile ) -> Dict [ str , Dict [ str , str ]] : ", "funcdef": "def"}, {"fullname": "debeir.datasets.bioreddit.BioRedditElasticsearchQuery", "modulename": "debeir.datasets.bioreddit", "qualname": "BioRedditElasticsearchQuery", "kind": "class", "doc": "Elasticsearch Query object for the BioReddit
\n", "bases": "debeir.core.query.GenericElasticsearchQuery"}, {"fullname": "debeir.datasets.bioreddit.BioRedditElasticsearchQuery.__init__", "modulename": "debeir.datasets.bioreddit", "qualname": "BioRedditElasticsearchQuery.__init__", "kind": "function", "doc": "
\n", "signature": "(topics , config , * args , ** kwargs ) "}, {"fullname": "debeir.datasets.clinical_trials", "modulename": "debeir.datasets.clinical_trials", "kind": "module", "doc": "
\n"}, {"fullname": "debeir.datasets.clinical_trials.TrialsQueryConfig", "modulename": "debeir.datasets.clinical_trials", "qualname": "TrialsQueryConfig", "kind": "class", "doc": "
\n", "bases": "debeir.core.config.GenericConfig"}, {"fullname": "debeir.datasets.clinical_trials.TrialsQueryConfig.__init__", "modulename": "debeir.datasets.clinical_trials", "qualname": "TrialsQueryConfig.__init__", "kind": "function", "doc": "
\n", "signature": "(\tquery_type : str , \tindex : str = None , \tencoder_normalize : bool = True , \tablations : bool = False , \tnorm_weight : float = None , \tautomatic : bool = None , \tencoder : object = None , \tencoder_fp : str = None , \tquery_weights : List [ float ] = None , \tcosine_weights : List [ float ] = None , \tevaluate : bool = False , \tqrels : str = None , \tconfig_fn : str = None , \tquery_fn : str = None , \tparser_fn : str = None , \texecutor_fn : str = None , \tcosine_ceiling : float = None , \ttopics_path : str = None , \treturn_id_only : bool = False , \toverwrite_output_if_exists : bool = False , \toutput_file : str = None , \trun_name : str = None , \tquery_field_usage : str = None , \tembed_field_usage : str = None , \tfields : List [ str ] = None ) "}, {"fullname": "debeir.datasets.clinical_trials.TrialsQueryConfig.validate", "modulename": "debeir.datasets.clinical_trials", "qualname": "TrialsQueryConfig.validate", "kind": "function", "doc": "Checks if query type is included, and checks if an encoder is included for embedding queries
\n", "signature": "(self ): ", "funcdef": "def"}, {"fullname": "debeir.datasets.clinical_trials.TrialsQueryConfig.from_toml", "modulename": "debeir.datasets.clinical_trials", "qualname": "TrialsQueryConfig.from_toml", "kind": "function", "doc": "Instantiates a Config object from a toml file
\n\nParameters \n\n\nfp : File path of the Config TOML file \nfield_class : Class of the Config object to be instantiated \nargs : Arguments to be passed to Config \nkwargs : Keyword arguments to be passed \n \n\nReturns \n\n\nAn instantiated and validated Config object.\n
\n \n", "signature": "(cls , fp : str , * args , ** kwargs ) -> debeir . core . config . GenericConfig : ", "funcdef": "def"}, {"fullname": "debeir.datasets.clinical_trials.TrialsQueryConfig.from_dict", "modulename": "debeir.datasets.clinical_trials", "qualname": "TrialsQueryConfig.from_dict", "kind": "function", "doc": "Instantiates a Config object from a dictionary
\n\nParameters \n\n\ndata_class : \nkwargs : \n \n\nReturns \n", "signature": "(cls , ** kwargs ) -> debeir . core . config . GenericConfig : ", "funcdef": "def"}, {"fullname": "debeir.datasets.clinical_trials.TrialsElasticsearchQuery", "modulename": "debeir.datasets.clinical_trials", "qualname": "TrialsElasticsearchQuery", "kind": "class", "doc": "Elasticsearch Query object for the Clinical Trials Index
\n", "bases": "debeir.core.query.GenericElasticsearchQuery"}, {"fullname": "debeir.datasets.clinical_trials.TrialsElasticsearchQuery.__init__", "modulename": "debeir.datasets.clinical_trials", "qualname": "TrialsElasticsearchQuery.__init__", "kind": "function", "doc": "
\n", "signature": "(topics , query_type , config = None , * args , ** kwargs ) "}, {"fullname": "debeir.datasets.clinical_trials.TrialsElasticsearchQuery.generate_query", "modulename": "debeir.datasets.clinical_trials", "qualname": "TrialsElasticsearchQuery.generate_query", "kind": "function", "doc": "Generates a query for the clinical trials index
\n\nParameters \n\n\ntopic_num : Topic number to search \nquery_field_usage : Which document facets to search over \nkwargs : \n \n\nReturns \n\n\nA basic elasticsearch query for clinical trials\n
\n \n", "signature": "(self , * args , ** kwargs ): ", "funcdef": "def"}, {"fullname": "debeir.datasets.clinical_trials.TrialsElasticsearchQuery.generate_query_ablation", "modulename": "debeir.datasets.clinical_trials", "qualname": "TrialsElasticsearchQuery.generate_query_ablation", "kind": "function", "doc": "Only search one document facet at a time
\n\nParameters \n\n\ntopic_num : \nkwargs : \n \n\nReturns \n", "signature": "(self , topic_num , ** kwargs ): ", "funcdef": "def"}, {"fullname": "debeir.datasets.clinical_trials.TrialsElasticsearchQuery.generate_query_embedding", "modulename": "debeir.datasets.clinical_trials", "qualname": "TrialsElasticsearchQuery.generate_query_embedding", "kind": "function", "doc": "Computes the NIR score for a given topic
\n\nScore = log(BM25)/log(norm_weight) + embedding_score
\n\nParameters \n\n\ntopic_num : \nencoder : \nquery_field_usage : \nembed_field_usage : \ncosine_weights : \nquery_weight : \nnorm_weight : \nablations : \nautomatic_scores : \nkwargs : \n \n\nReturns \n", "signature": "(self , * args , ** kwargs ): ", "funcdef": "def"}, {"fullname": "debeir.datasets.clinical_trials.TrialsElasticsearchQuery.get_query_type", "modulename": "debeir.datasets.clinical_trials", "qualname": "TrialsElasticsearchQuery.get_query_type", "kind": "function", "doc": "
\n", "signature": "(self , * args , ** kwargs ): ", "funcdef": "def"}, {"fullname": "debeir.datasets.clinical_trials.TrialsElasticsearchQuery.get_id_mapping", "modulename": "debeir.datasets.clinical_trials", "qualname": "TrialsElasticsearchQuery.get_id_mapping", "kind": "function", "doc": "Get the document ID
\n\nParameters \n\n\nhit : The raw document result \n \n\nReturns \n\n\nThe document's ID\n
\n \n", "signature": "(self , hit ): ", "funcdef": "def"}, {"fullname": "debeir.datasets.clinical_trials.ClinicalTrialsElasticsearchExecutor", "modulename": "debeir.datasets.clinical_trials", "qualname": "ClinicalTrialsElasticsearchExecutor", "kind": "class", "doc": "Executes queries given a query object.
\n", "bases": "debeir.core.executor.GenericElasticsearchExecutor"}, {"fullname": "debeir.datasets.clinical_trials.ClinicalTrialsElasticsearchExecutor.__init__", "modulename": "debeir.datasets.clinical_trials", "qualname": "ClinicalTrialsElasticsearchExecutor.__init__", "kind": "function", "doc": "
\n", "signature": "(\ttopics : Dict [ Union [ str , int ], Dict [ str , str ]] , \tclient : elasticsearch . AsyncElasticsearch , \tindex_name : str , \toutput_file : str , \tquery : debeir . datasets . clinical_trials . TrialsElasticsearchQuery , \tencoder : Optional [ debeir . rankers . transformer_sent_encoder . Encoder ] = None , \tconfig = None , \t* args , \t** kwargs ) "}, {"fullname": "debeir.datasets.clinical_trials.ClinicalTrialParser", "modulename": "debeir.datasets.clinical_trials", "qualname": "ClinicalTrialParser", "kind": "class", "doc": "Parser for Clinical Trials topics
\n", "bases": "debeir.core.parser.Parser"}, {"fullname": "debeir.datasets.clinical_trials.ClinicalTrialParser.get_topics", "modulename": "debeir.datasets.clinical_trials", "qualname": "ClinicalTrialParser.get_topics", "kind": "function", "doc": "Instance method for getting topics, forwards instance self parameters to the _get_topics class method.
\n", "signature": "(cls , csvfile ) -> Dict [ int , Dict [ str , str ]] : ", "funcdef": "def"}, {"fullname": "debeir.datasets.factory", "modulename": "debeir.datasets.factory", "kind": "module", "doc": "
\n"}, {"fullname": "debeir.datasets.factory.get_index_name", "modulename": "debeir.datasets.factory", "qualname": "get_index_name", "kind": "function", "doc": "Get the index name from the config without parsing as a TOML
\n\nParameters \n\n\n\nReturns \n", "signature": "(config_fp ): ", "funcdef": "def"}, {"fullname": "debeir.datasets.factory.factory_fn", "modulename": "debeir.datasets.factory", "qualname": "factory_fn", "kind": "function", "doc": "Factory method for creating the parsed topics, config object, query object and query executor object
\n\nParameters \n\n\nconfig_fp : Config file path \nindex : Index to search \n \n\nReturns \n\n\nQuery, Config, Parser, Executor, Evaluator\n
\n \n", "signature": "(\tconfig_fp , \tindex = None ) -> (<class 'debeir.core.query.Query'>, <class 'debeir.core.config.GenericConfig'>, <class 'debeir.core.parser.Parser'>, <class 'debeir.core.executor.GenericElasticsearchExecutor'>, <class 'debeir.evaluation.evaluator.Evaluator'>): ", "funcdef": "def"}, {"fullname": "debeir.datasets.factory.config_factory", "modulename": "debeir.datasets.factory", "qualname": "config_factory", "kind": "function", "doc": "Factory method for creating configs
\n\nParameters \n\n\npath : Config path \nconfig_cls : Config class to instantiate \nargs_dict : Arguments to consider \n \n\nReturns \n\n\nA config object\n
\n \n", "signature": "(\tpath : Union [ str , pathlib . Path ] = None , \tconfig_cls : Type [ debeir . core . config . Config ] = None , \targs_dict : Dict = None ): ", "funcdef": "def"}, {"fullname": "debeir.datasets.factory.get_nir_config", "modulename": "debeir.datasets.factory", "qualname": "get_nir_config", "kind": "function", "doc": "
\n", "signature": "(nir_config , * args , ignore_errors = False , ** kwargs ): ", "funcdef": "def"}, {"fullname": "debeir.datasets.factory.apply_nir_config", "modulename": "debeir.datasets.factory", "qualname": "apply_nir_config", "kind": "function", "doc": "Decorator that applies the NIR config settings to the current function\nReplaces arguments and keywords arguments with those found in the config
\n\nParameters \n\n\n\nReturns \n", "signature": "(func ): ", "funcdef": "def"}, {"fullname": "debeir.datasets.marco", "modulename": "debeir.datasets.marco", "kind": "module", "doc": "
\n"}, {"fullname": "debeir.datasets.marco.MarcoElasticsearchExecutor", "modulename": "debeir.datasets.marco", "qualname": "MarcoElasticsearchExecutor", "kind": "class", "doc": "Generic Executor class for Elasticsearch
\n", "bases": "debeir.core.executor.GenericElasticsearchExecutor"}, {"fullname": "debeir.datasets.marco.MarcoElasticsearchExecutor.__init__", "modulename": "debeir.datasets.marco", "qualname": "MarcoElasticsearchExecutor.__init__", "kind": "function", "doc": "
\n", "signature": "(\ttopics : Dict [ Union [ str , int ], Dict [ str , str ]] , \tclient : elasticsearch . AsyncElasticsearch , \tindex_name : str , \toutput_file : str , \tquery : debeir . core . query . GenericElasticsearchQuery , \tencoder : Optional [ debeir . rankers . transformer_sent_encoder . Encoder ] = None , \tconfig = None , \t* args , \t** kwargs ) "}, {"fullname": "debeir.datasets.marco.MarcoElasticsearchExecutor.generate_query", "modulename": "debeir.datasets.marco", "qualname": "MarcoElasticsearchExecutor.generate_query", "kind": "function", "doc": "Generates a standard BM25 query given the topic number
\n\nParameters \n\n\ntopic_num : Query topic number to generate \nbest_fields : Whether to use a curated list of fields \nkwargs : \n \n\nReturns \n", "signature": "(self , topic_num , best_fields = True , ** kwargs ): ", "funcdef": "def"}, {"fullname": "debeir.datasets.marco.MarcoElasticsearchExecutor.generate_embedding_query", "modulename": "debeir.datasets.marco", "qualname": "MarcoElasticsearchExecutor.generate_embedding_query", "kind": "function", "doc": "Executes an NIR-style query with combined scoring.
\n\nParameters \n\n\ntopic_num : \ncosine_weights : \nquery_weights : \nnorm_weight : \nautomatic_scores : \nkwargs : \n \n\nReturns \n", "signature": "(\tself , \ttopic_num , \tcosine_weights = None , \tquery_weights = None , \tnorm_weight = 2.15 , \tautomatic_scores = None , \t** kwargs ): ", "funcdef": "def"}, {"fullname": "debeir.datasets.marco.MarcoElasticsearchExecutor.execute_query", "modulename": "debeir.datasets.marco", "qualname": "MarcoElasticsearchExecutor.execute_query", "kind": "function", "doc": "Execute a query given parameters
\n\nParameters \n\n\n", "signature": "(\tself , \tquery = None , \ttopic_num = None , \tablation = False , \tquery_type = 'query' , \t** kwargs ): ", "funcdef": "async def"}, {"fullname": "debeir.datasets.marco.MarcoQueryConfig", "modulename": "debeir.datasets.marco", "qualname": "MarcoQueryConfig", "kind": "class", "doc": "
\n", "bases": "debeir.core.config.GenericConfig"}, {"fullname": "debeir.datasets.marco.MarcoQueryConfig.__init__", "modulename": "debeir.datasets.marco", "qualname": "MarcoQueryConfig.__init__", "kind": "function", "doc": "
\n", "signature": "(\tquery_type : str , \tindex : str = None , \tencoder_normalize : bool = True , \tablations : bool = False , \tnorm_weight : float = None , \tautomatic : bool = None , \tencoder : object = None , \tencoder_fp : str = None , \tquery_weights : List [ float ] = None , \tcosine_weights : List [ float ] = None , \tevaluate : bool = False , \tqrels : str = None , \tconfig_fn : str = None , \tquery_fn : str = None , \tparser_fn : str = None , \texecutor_fn : str = None , \tcosine_ceiling : float = None , \ttopics_path : str = None , \treturn_id_only : bool = False , \toverwrite_output_if_exists : bool = False , \toutput_file : str = None , \trun_name : str = None ) "}, {"fullname": "debeir.datasets.marco.MarcoQueryConfig.validate", "modulename": "debeir.datasets.marco", "qualname": "MarcoQueryConfig.validate", "kind": "function", "doc": "Validates if the config is correct.\nMust be implemented by inherited classes.
\n", "signature": "(self ): ", "funcdef": "def"}, {"fullname": "debeir.datasets.marco.MarcoQueryConfig.from_toml", "modulename": "debeir.datasets.marco", "qualname": "MarcoQueryConfig.from_toml", "kind": "function", "doc": "Instantiates a Config object from a toml file
\n\nParameters \n\n\nfp : File path of the Config TOML file \nfield_class : Class of the Config object to be instantiated \nargs : Arguments to be passed to Config \nkwargs : Keyword arguments to be passed \n \n\nReturns \n\n\nA instantiated and validated Config object.\n
\n \n", "signature": "(cls , fp : str , * args , ** kwargs ) -> debeir . datasets . marco . MarcoQueryConfig : ", "funcdef": "def"}, {"fullname": "debeir.datasets.marco.MarcoQueryConfig.from_dict", "modulename": "debeir.datasets.marco", "qualname": "MarcoQueryConfig.from_dict", "kind": "function", "doc": "Instantiates a Config object from a dictionary
\n\nParameters \n\n\ndata_class : \nkwargs : \n \n\nReturns \n", "signature": "(cls , ** kwargs ) -> debeir . datasets . marco . MarcoQueryConfig : ", "funcdef": "def"}, {"fullname": "debeir.datasets.trec_clinical_trials", "modulename": "debeir.datasets.trec_clinical_trials", "kind": "module", "doc": "
\n"}, {"fullname": "debeir.datasets.trec_clinical_trials.TREClinicalTrialDocumentParser", "modulename": "debeir.datasets.trec_clinical_trials", "qualname": "TREClinicalTrialDocumentParser", "kind": "class", "doc": "Parser for Clinical Trials topics
\n", "bases": "debeir.core.parser.XMLParser"}, {"fullname": "debeir.datasets.trec_clinical_trials.TREClinicalTrialDocumentParser.extract", "modulename": "debeir.datasets.trec_clinical_trials", "qualname": "TREClinicalTrialDocumentParser.extract", "kind": "function", "doc": "
\n", "signature": "(cls , path ) -> Dict : ", "funcdef": "def"}, {"fullname": "debeir.datasets.trec_clinical_trials.TrecClincialElasticsearchQuery", "modulename": "debeir.datasets.trec_clinical_trials", "qualname": "TrecClincialElasticsearchQuery", "kind": "class", "doc": "A generic elasticsearch query. Contains methods for NIR-style (embedding) queries and normal BM25 queries.\nRequires topics, configs to be included
\n", "bases": "debeir.core.query.GenericElasticsearchQuery"}, {"fullname": "debeir.datasets.trec_clinical_trials.TrecClincialElasticsearchQuery.__init__", "modulename": "debeir.datasets.trec_clinical_trials", "qualname": "TrecClincialElasticsearchQuery.__init__", "kind": "function", "doc": "
\n", "signature": "(topics , config , * args , ** kwargs ) "}, {"fullname": "debeir.datasets.trec_covid", "modulename": "debeir.datasets.trec_covid", "kind": "module", "doc": "
\n"}, {"fullname": "debeir.datasets.trec_covid.TrecCovidParser", "modulename": "debeir.datasets.trec_covid", "qualname": "TrecCovidParser", "kind": "class", "doc": "Load topics from an XML file
\n", "bases": "debeir.core.parser.XMLParser"}, {"fullname": "debeir.datasets.trec_covid.TrecCovidParser.get_topics", "modulename": "debeir.datasets.trec_covid", "qualname": "TrecCovidParser.get_topics", "kind": "function", "doc": "Instance method for getting topics, forwards instance self parameters to the _get_topics class method.
\n", "signature": "(cls , xmlfile ) -> Dict [ int , Dict [ str , str ]] : ", "funcdef": "def"}, {"fullname": "debeir.datasets.trec_covid.TrecElasticsearchQuery", "modulename": "debeir.datasets.trec_covid", "qualname": "TrecElasticsearchQuery", "kind": "class", "doc": "A generic elasticsearch query. Contains methods for NIR-style (embedding) queries and normal BM25 queries.\nRequires topics, configs to be included
\n", "bases": "debeir.core.query.GenericElasticsearchQuery"}, {"fullname": "debeir.datasets.trec_covid.TrecElasticsearchQuery.__init__", "modulename": "debeir.datasets.trec_covid", "qualname": "TrecElasticsearchQuery.__init__", "kind": "function", "doc": "
\n", "signature": "(topics , config , * args , ** kwargs ) "}, {"fullname": "debeir.datasets.types", "modulename": "debeir.datasets.types", "kind": "module", "doc": "
\n"}, {"fullname": "debeir.datasets.types.InputExample", "modulename": "debeir.datasets.types", "qualname": "InputExample", "kind": "class", "doc": "Copied from Sentence Transformer Library\nStructure for one input example with texts, the label and a unique id
\n"}, {"fullname": "debeir.datasets.types.InputExample.__init__", "modulename": "debeir.datasets.types", "qualname": "InputExample.__init__", "kind": "function", "doc": "Creates one InputExample with the given texts, guid and label
\n\n:param guid\n id for the example\n:param texts\n the texts for the example. Note, str.strip() is called on the texts\n:param label\n the label for the example
\n", "signature": "(\tguid : str = '' , \ttexts : List [ str ] = None , \tlabel : Union [ int , float ] = 0 ) "}, {"fullname": "debeir.datasets.types.InputExample.get_label", "modulename": "debeir.datasets.types", "qualname": "InputExample.get_label", "kind": "function", "doc": "
\n", "signature": "(self ): ", "funcdef": "def"}, {"fullname": "debeir.datasets.types.InputExample.to_dict", "modulename": "debeir.datasets.types", "qualname": "InputExample.to_dict", "kind": "function", "doc": "
\n", "signature": "(cls , data : List [ debeir . datasets . types . InputExample ] ): ", "funcdef": "def"}, {"fullname": "debeir.datasets.types.InputExample.from_parser_output", "modulename": "debeir.datasets.types", "qualname": "InputExample.from_parser_output", "kind": "function", "doc": "
\n", "signature": "(cls , data ): ", "funcdef": "def"}, {"fullname": "debeir.datasets.types.RelevanceExample", "modulename": "debeir.datasets.types", "qualname": "RelevanceExample", "kind": "class", "doc": "Converts Relevance Labels to 0 - 1
\n", "bases": "InputExample"}, {"fullname": "debeir.datasets.types.RelevanceExample.__init__", "modulename": "debeir.datasets.types", "qualname": "RelevanceExample.__init__", "kind": "function", "doc": "Creates one InputExample with the given texts, guid and label
\n\n:param guid\n id for the example\n:param texts\n the texts for the example. Note, str.strip() is called on the texts\n:param label\n the label for the example
\n", "signature": "(max_score = 2 , * args , ** kwargs ) "}, {"fullname": "debeir.datasets.types.RelevanceExample.get_label", "modulename": "debeir.datasets.types", "qualname": "RelevanceExample.get_label", "kind": "function", "doc": "
\n", "signature": "(self ): ", "funcdef": "def"}, {"fullname": "debeir.datasets.types.RelevanceExample.relevance", "modulename": "debeir.datasets.types", "qualname": "RelevanceExample.relevance", "kind": "function", "doc": "Returns \n\n\nReturns a normalised score for relevance between 0 - 1\n
\n \n", "signature": "(self ): ", "funcdef": "def"}, {"fullname": "debeir.datasets.types.DatasetTypes", "modulename": "debeir.datasets.types", "qualname": "DatasetTypes", "kind": "class", "doc": "A collection of common dataset types that is usable in the library.
\n", "bases": "enum.Enum"}, {"fullname": "debeir.datasets.utils", "modulename": "debeir.datasets.utils", "kind": "module", "doc": "
\n"}, {"fullname": "debeir.datasets.utils.CrossValidatorDataset", "modulename": "debeir.datasets.utils", "qualname": "CrossValidatorDataset", "kind": "class", "doc": "Cross Validator Dataset
\n"}, {"fullname": "debeir.datasets.utils.CrossValidatorDataset.__init__", "modulename": "debeir.datasets.utils", "qualname": "CrossValidatorDataset.__init__", "kind": "function", "doc": "
\n", "signature": "(dataset , cross_validator , n_folds , x_attr = 'text' , y_attr = 'label' ) "}, {"fullname": "debeir.datasets.utils.CrossValidatorDataset.prepare_cross_validator", "modulename": "debeir.datasets.utils", "qualname": "CrossValidatorDataset.prepare_cross_validator", "kind": "function", "doc": "Prepare the cross validator dataset object that will internally produce the folds.
\n\nParameters \n\n\ndata : Dataset to be used. Should be a list of dicts, or list of [x,y] or a Dataset object from data_sets \nevaluator : Evaluator to use for checking results \nn_splits : Number of cross validation splits, k-fold (stratified) \nseed : Seed to use (default 42) \ny_attr : Label, or idx of the y label \nx_attr : Label or idx of the x label (not directly used) \n \n", "signature": "(\tcls , \tdata , \tevaluator : debeir . evaluation . evaluator . Evaluator , \tn_splits : int , \tx_attr , \ty_attr , \tseed = 42 ) -> debeir . datasets . utils . CrossValidatorDataset : ", "funcdef": "def"}, {"fullname": "debeir.datasets.utils.CrossValidatorDataset.get_fold", "modulename": "debeir.datasets.utils", "qualname": "CrossValidatorDataset.get_fold", "kind": "function", "doc": "Get the fold and returns a dataset.DataDict object with\nDataDict{'train': ..., 'val': ...}
\n\nParameters \n\n\n", "signature": "(self , idx ) -> datasets . dataset_dict . DatasetDict : ", "funcdef": "def"}, {"fullname": "debeir.engines", "modulename": "debeir.engines", "kind": "module", "doc": "WIP
\n\nImplemented Search Engines to run queries against.
\n"}, {"fullname": "debeir.engines.client", "modulename": "debeir.engines.client", "kind": "module", "doc": "
\n"}, {"fullname": "debeir.engines.client.Client", "modulename": "debeir.engines.client", "qualname": "Client", "kind": "class", "doc": "Overarching client interface object that contains references to different clients for search\nAllows sharing between function calls
\n"}, {"fullname": "debeir.engines.client.Client.__init__", "modulename": "debeir.engines.client", "qualname": "Client.__init__", "kind": "function", "doc": "
\n", "signature": "(\tes_client : elasticsearch . AsyncElasticsearch = None , \tsolr_client : object = None , \tgeneric_client : object = None ) "}, {"fullname": "debeir.engines.client.Client.build_from_config", "modulename": "debeir.engines.client", "qualname": "Client.build_from_config", "kind": "function", "doc": "Build client from engine config
\n\nParameters \n\n\nengine_type : \nengine_config : \n \n\nReturns \n", "signature": "(cls , engine_type , engine_config ) -> debeir . engines . client . Client : ", "funcdef": "def"}, {"fullname": "debeir.engines.client.Client.get_client", "modulename": "debeir.engines.client", "qualname": "Client.get_client", "kind": "function", "doc": "
\n", "signature": "(self , engine ): ", "funcdef": "def"}, {"fullname": "debeir.engines.client.Client.close", "modulename": "debeir.engines.client", "qualname": "Client.close", "kind": "function", "doc": "Generically close all contained client objects
\n", "signature": "(self ): ", "funcdef": "async def"}, {"fullname": "debeir.engines.dummyindex", "modulename": "debeir.engines.dummyindex", "kind": "module", "doc": "
\n"}, {"fullname": "debeir.engines.dummyindex.index", "modulename": "debeir.engines.dummyindex.index", "kind": "module", "doc": "
\n"}, {"fullname": "debeir.engines.dummyindex.index.DummyIndex", "modulename": "debeir.engines.dummyindex.index", "qualname": "DummyIndex", "kind": "class", "doc": "
\n"}, {"fullname": "debeir.engines.dummyindex.index.DummyIndex.__init__", "modulename": "debeir.engines.dummyindex.index", "qualname": "DummyIndex.__init__", "kind": "function", "doc": "
\n", "signature": "() "}, {"fullname": "debeir.engines.dummyindex.index.DummyIndex.get_documents", "modulename": "debeir.engines.dummyindex.index", "qualname": "DummyIndex.get_documents", "kind": "function", "doc": "
\n", "signature": "(self ): ", "funcdef": "def"}, {"fullname": "debeir.engines.dummyindex.index.DummyIndex.query", "modulename": "debeir.engines.dummyindex.index", "qualname": "DummyIndex.query", "kind": "function", "doc": "
\n", "signature": "(self ): ", "funcdef": "def"}, {"fullname": "debeir.engines.dummyindex.index.DummyIndex.scorer", "modulename": "debeir.engines.dummyindex.index", "qualname": "DummyIndex.scorer", "kind": "function", "doc": "
\n", "signature": "(self ): ", "funcdef": "def"}, {"fullname": "debeir.engines.dummyindex.index.es_isup", "modulename": "debeir.engines.dummyindex.index", "qualname": "es_isup", "kind": "function", "doc": "
\n", "signature": "(es_client : elasticsearch . AsyncElasticsearch ): ", "funcdef": "async def"}, {"fullname": "debeir.engines.elasticsearch", "modulename": "debeir.engines.elasticsearch", "kind": "module", "doc": "Library code for interacting with the elasticsearch engine
\n\nContains many helper functions for asynchronous and fast querying, with optional caching available
\n"}, {"fullname": "debeir.engines.elasticsearch.change_bm25", "modulename": "debeir.engines.elasticsearch.change_bm25", "kind": "module", "doc": "
\n"}, {"fullname": "debeir.engines.elasticsearch.change_bm25.change_bm25_params", "modulename": "debeir.engines.elasticsearch.change_bm25", "qualname": "change_bm25_params", "kind": "function", "doc": "Change the BM25 parameters of the elasticsearch BM25 ranker.
\n\nParameters \n\n\nindex : The elasticsearch index name \nk1 : The k parameter for BM25 (default 1.2) [Usually 0-3] [Term saturation constant] ->\nThe higher the k value, the more weight given to document that repeat terms. \nb : The b parameter for BM25 (default 0.75) [Usually 0-1] [Document length constant] ->\nThe higher the b value, the higher it penalises longer documents. \nbase_url : The elasticsearch base URL for API requests (without index suffix) \n \n", "signature": "(index , k1 : float , b : float , base_url : str = 'http://localhost:9200' ): ", "funcdef": "def"}, {"fullname": "debeir.engines.elasticsearch.executor", "modulename": "debeir.engines.elasticsearch.executor", "kind": "module", "doc": "
\n"}, {"fullname": "debeir.engines.elasticsearch.executor.ElasticsearchExecutor", "modulename": "debeir.engines.elasticsearch.executor", "qualname": "ElasticsearchExecutor", "kind": "class", "doc": "Executes an elasticsearch query given the query generated from the config, topics and query class object.
\n\nComputes regular patterns of queries expected from general IR topics and indexes.\nIncludes:\n 1. Reranking\n 2. End-to-End Neural IR\n 3. Statistical keyword matching
\n"}, {"fullname": "debeir.engines.elasticsearch.executor.ElasticsearchExecutor.__init__", "modulename": "debeir.engines.elasticsearch.executor", "qualname": "ElasticsearchExecutor.__init__", "kind": "function", "doc": "
\n", "signature": "(\ttopics : Dict [ Union [ str , int ], Dict [ str , str ]] , \tclient : elasticsearch . AsyncElasticsearch , \tindex_name : str , \toutput_file : str , \tquery : debeir . core . query . GenericElasticsearchQuery , \tencoder : Optional [ debeir . rankers . transformer_sent_encoder . Encoder ] , \treturn_size : int = 1000 , \ttest = False , \treturn_id_only = True , \tconfig = None ) "}, {"fullname": "debeir.engines.elasticsearch.executor.ElasticsearchExecutor.generate_query", "modulename": "debeir.engines.elasticsearch.executor", "qualname": "ElasticsearchExecutor.generate_query", "kind": "function", "doc": "Generates a query given a topic number from the list of topics
\n\nParameters \n\n\n", "signature": "(self , topic_num ): ", "funcdef": "def"}, {"fullname": "debeir.engines.elasticsearch.executor.ElasticsearchExecutor.execute_query", "modulename": "debeir.engines.elasticsearch.executor", "qualname": "ElasticsearchExecutor.execute_query", "kind": "function", "doc": "Execute a query given parameters
\n\nParameters \n\n\n", "signature": "(self , * args , ** kwargs ): ", "funcdef": "def"}, {"fullname": "debeir.engines.elasticsearch.executor.ElasticsearchExecutor.run_all_queries", "modulename": "debeir.engines.elasticsearch.executor", "qualname": "ElasticsearchExecutor.run_all_queries", "kind": "function", "doc": "A generic function that will asynchronously run all topics using the execute_query() method
\n\nParameters \n\n\nquery_type : Which query to execute. Query_type determines which method is used to generate the queries\nfrom self.query.query_funcs: Dict[str, func] \nreturn_results : Whether to return raw results from the client. Useful for analysing results directly or\nfor computing the BM25 scores for log normalization in NIR-style scoring \nreturn_size : Number of documents to return. Overrides the config value if exists. \nreturn_id_only : Return the ID of the document only, rather than the full source document. \nargs : Arguments to pass to the execute_query method \nkwargs : Keyword arguments to pass to the execute_query method \n \n\nReturns \n\n\nA list of results if return_results = True else an empty list is returned.\n
\n \n", "signature": "(\tself , \tquery_type = None , \treturn_results = False , \treturn_size : int = None , \treturn_id_only : bool = False , \t** kwargs ) -> List : ", "funcdef": "async def"}, {"fullname": "debeir.engines.elasticsearch.generate_script_score", "modulename": "debeir.engines.elasticsearch.generate_script_score", "kind": "module", "doc": "
\n"}, {"fullname": "debeir.engines.elasticsearch.generate_script_score.SourceBuilder", "modulename": "debeir.engines.elasticsearch.generate_script_score", "qualname": "SourceBuilder", "kind": "class", "doc": "Builds Script Score source for NIR-style queries in elasticsearch\nUses the painless language
\n\nThis is a string builder class
\n"}, {"fullname": "debeir.engines.elasticsearch.generate_script_score.SourceBuilder.__init__", "modulename": "debeir.engines.elasticsearch.generate_script_score", "qualname": "SourceBuilder.__init__", "kind": "function", "doc": "
\n", "signature": "() "}, {"fullname": "debeir.engines.elasticsearch.generate_script_score.SourceBuilder.add_preamble", "modulename": "debeir.engines.elasticsearch.generate_script_score", "qualname": "SourceBuilder.add_preamble", "kind": "function", "doc": "Adds preamble to the internal string\nThis will return the bm25 score if the normalization constant is below 0
\n", "signature": "(self ): ", "funcdef": "def"}, {"fullname": "debeir.engines.elasticsearch.generate_script_score.SourceBuilder.add_log_score", "modulename": "debeir.engines.elasticsearch.generate_script_score", "qualname": "SourceBuilder.add_log_score", "kind": "function", "doc": "Adds the BM25 log score line
\n\nParameters \n\n\nignore_below_one : Ignore all scores below 1.0 as Log(1) = 0. Otherwise, just ignore Log(0 and under). \n \n\nReturns \n\n\nSourceBuilder\n
\n \n", "signature": "(\tself , \tignore_below_one = False ) -> debeir . engines . elasticsearch . generate_script_score . SourceBuilder : ", "funcdef": "def"}, {"fullname": "debeir.engines.elasticsearch.generate_script_score.SourceBuilder.add_embed_field", "modulename": "debeir.engines.elasticsearch.generate_script_score", "qualname": "SourceBuilder.add_embed_field", "kind": "function", "doc": "Adds a cosine score line.
\n\nParameters \n\n\nqfield : Query field \nfield : Document facet field \n \n\nReturns \n", "signature": "(\tself , \tqfield , \tfield ) -> debeir . engines . elasticsearch . generate_script_score . SourceBuilder : ", "funcdef": "def"}, {"fullname": "debeir.engines.elasticsearch.generate_script_score.SourceBuilder.finish", "modulename": "debeir.engines.elasticsearch.generate_script_score", "qualname": "SourceBuilder.finish", "kind": "function", "doc": "Finalises the script score and returns the internal string
\n\nReturns \n\n\nA string containing the script score query\n
\n \n", "signature": "(self ): ", "funcdef": "def"}, {"fullname": "debeir.engines.elasticsearch.generate_script_score.generate_source", "modulename": "debeir.engines.elasticsearch.generate_script_score", "qualname": "generate_source", "kind": "function", "doc": "Generates the script source based off a set of input fields and facets
\n\nParameters \n\n\nqfields : Query fields (or topic fields) \nfields : Document facets to compute cosine similarity on \n \n\nReturns \n", "signature": "(qfields : Union [ list , str ] , fields ) -> str : ", "funcdef": "def"}, {"fullname": "debeir.engines.elasticsearch.generate_script_score.check_params_is_valid", "modulename": "debeir.engines.elasticsearch.generate_script_score", "qualname": "check_params_is_valid", "kind": "function", "doc": "Validate if the parameters for the script score passes a simple sanity check.
\n\nParameters \n\n\n", "signature": "(params , qfields ): ", "funcdef": "def"}, {"fullname": "debeir.engines.elasticsearch.generate_script_score.generate_script", "modulename": "debeir.engines.elasticsearch.generate_script_score", "qualname": "generate_script", "kind": "function", "doc": "Parameters for creating the script
\n\nParameters \n\n\nfields : Document fields to search \nparams : Parameters for the script \nsource_generator : Function that will generate the script \nqfields : Query fields to search from (topic facets) \n \n\nReturns \n", "signature": "(\tfields , \tparams , \tsource_generator =< function generate_source > , \tqfields = 'q_eb' ) -> Dict : ", "funcdef": "def"}, {"fullname": "debeir.engines.solr", "modulename": "debeir.engines.solr", "kind": "module", "doc": "
\n"}, {"fullname": "debeir.evaluation", "modulename": "debeir.evaluation", "kind": "module", "doc": "Evaluation for retrieved results.
\n\nWorks for TREC-style queries or for out-the-box returned results from the implemented search engines.
\n"}, {"fullname": "debeir.evaluation.cross_validation", "modulename": "debeir.evaluation.cross_validation", "kind": "module", "doc": "
\n"}, {"fullname": "debeir.evaluation.cross_validation.split_k_fold", "modulename": "debeir.evaluation.cross_validation", "qualname": "split_k_fold", "kind": "function", "doc": "
\n", "signature": "(n_fold , data_files ): ", "funcdef": "def"}, {"fullname": "debeir.evaluation.cross_validation.CrossValidatorTypes", "modulename": "debeir.evaluation.cross_validation", "qualname": "CrossValidatorTypes", "kind": "class", "doc": "Cross Validator Strategies for separating the dataset
\n", "bases": "enum.Enum"}, {"fullname": "debeir.evaluation.cross_validation.CrossValidatorTypes.Stratified", "modulename": "debeir.evaluation.cross_validation", "qualname": "CrossValidatorTypes.Stratified", "kind": "variable", "doc": "
\n", "default_value": " = <CrossValidatorTypes.Stratified: 'StratifiedKFold'>"}, {"fullname": "debeir.evaluation.cross_validation.CrossValidatorTypes.KFold", "modulename": "debeir.evaluation.cross_validation", "qualname": "CrossValidatorTypes.KFold", "kind": "variable", "doc": "
\n", "default_value": " = <CrossValidatorTypes.KFold: 'KFold'>"}, {"fullname": "debeir.evaluation.cross_validation.CrossValidator", "modulename": "debeir.evaluation.cross_validation", "qualname": "CrossValidator", "kind": "class", "doc": "Cross Validator Class for different types of data_sets
\n\nE.g. List -> [[Data], label]\n List[Dict] -> {\"data\": Data, \"label\": label}\n Huggingface Dataset Object -> Data(set=\"train\", label = \"label\").select(idx)
\n"}, {"fullname": "debeir.evaluation.cross_validation.CrossValidator.__init__", "modulename": "debeir.evaluation.cross_validation", "qualname": "CrossValidator.__init__", "kind": "function", "doc": "
\n", "signature": "(\tdataset : Union [ List , List [ Dict ], datasets . arrow_dataset . Dataset ] , \tx_idx_label_or_attr : Union [ str , int ] , \ty_idx_label_or_attr : Union [ str , int ] , \tcross_validator_type: [<class 'str'>, <enum 'CrossValidatorTypes'>] = <CrossValidatorTypes.Stratified: 'StratifiedKFold'>, \tseed = 42 , \tn_splits = 5 ) "}, {"fullname": "debeir.evaluation.cross_validation.CrossValidator.get_fold", "modulename": "debeir.evaluation.cross_validation", "qualname": "CrossValidator.get_fold", "kind": "function", "doc": "Parameters \n\n\nfold_num : Which fold to pick \n \n\nReturns \n", "signature": "(self , fold_num : int ): ", "funcdef": "def"}, {"fullname": "debeir.evaluation.evaluator", "modulename": "debeir.evaluation.evaluator", "kind": "module", "doc": "
\n"}, {"fullname": "debeir.evaluation.evaluator.Evaluator", "modulename": "debeir.evaluation.evaluator", "qualname": "Evaluator", "kind": "class", "doc": "Evaluation class for computing metrics from TREC-style files
\n"}, {"fullname": "debeir.evaluation.evaluator.Evaluator.__init__", "modulename": "debeir.evaluation.evaluator", "qualname": "Evaluator.__init__", "kind": "function", "doc": "
\n", "signature": "(qrels : str , metrics : List [ str ] ) "}, {"fullname": "debeir.evaluation.evaluator.Evaluator.evaluate_runs", "modulename": "debeir.evaluation.evaluator", "qualname": "Evaluator.evaluate_runs", "kind": "function", "doc": "Evaluates the TREC-style results from an input result list or file
\n\nParameters \n\n\nres : Results file path or raw results list \nkwargs : Keyword arguments to pass to the underlying analysis_tools_ir.parse_run library \n \n\nReturns \n", "signature": "(self , res : Union [ str , List [ str ]] , ** kwargs ): ", "funcdef": "def"}, {"fullname": "debeir.evaluation.evaluator.Evaluator.average_all_metrics", "modulename": "debeir.evaluation.evaluator", "qualname": "Evaluator.average_all_metrics", "kind": "function", "doc": "Averages the metric per topic scores into a single averaged score.
\n\nParameters \n\n\nruns: Parsed run dictionary: {metric_name@depth : Run object} \nlogger : Logger to print metrics \n \n", "signature": "(\tself , \truns : Dict , \tlogger : < loguru . logger handlers = [( id = 0 , level = 10 , sink =< _io . StringIO object at 0x103af2710 > )] > ): ", "funcdef": "def"}, {"fullname": "debeir.evaluation.evaluator.Evaluator.sigtests", "modulename": "debeir.evaluation.evaluator", "qualname": "Evaluator.sigtests", "kind": "function", "doc": "Run a paired significance test on two result files
\n\nParameters \n\n\nresults_a : \nresults_b : \n \n\nReturns \n", "signature": "(self , results_a , results_b ): ", "funcdef": "def"}, {"fullname": "debeir.evaluation.evaluator.Evaluator.build_from_config", "modulename": "debeir.evaluation.evaluator", "qualname": "Evaluator.build_from_config", "kind": "function", "doc": "
\n", "signature": "(\tcls , \tconfig : debeir . core . config . GenericConfig , \tmetrics_config : debeir . core . config . MetricsConfig ): ", "funcdef": "def"}, {"fullname": "debeir.evaluation.residual_scoring", "modulename": "debeir.evaluation.residual_scoring", "kind": "module", "doc": "
\n"}, {"fullname": "debeir.evaluation.residual_scoring.ResidualEvaluator", "modulename": "debeir.evaluation.residual_scoring", "qualname": "ResidualEvaluator", "kind": "class", "doc": "Residual Scoring is the scoring of a subset of documents or the residiaul. The residual is created by removing documents from the collection and qrels.
\n", "bases": "debeir.evaluation.evaluator.Evaluator"}, {"fullname": "debeir.evaluation.residual_scoring.ResidualEvaluator.__init__", "modulename": "debeir.evaluation.residual_scoring", "qualname": "ResidualEvaluator.__init__", "kind": "function", "doc": "Args:\n qrels (str): Path to qrels \n metrics (List[str]): A list of metrics with depth e.g. NDCG@1000\n filter_ids (Dict[str, List[str]]): A list of IDs to remove from the collection given by Dict[Topic_num, [Docids]]
\n", "signature": "(qrels : str , metrics : List [ str ] , filter_ids : Dict [ str , List [ str ]] ) "}, {"fullname": "debeir.evaluation.residual_scoring.ResidualEvaluator.evaluate_runs", "modulename": "debeir.evaluation.residual_scoring", "qualname": "ResidualEvaluator.evaluate_runs", "kind": "function", "doc": "Run the residual evaluation for the runs
\n\nParameters \n\n\nres : The results to run the evaluator against \nwith_trec_binary : Use the TREC C binary instead of the default Python library, defaults to False \n \n\nReturns \n\n\n A dictionary of supplied metrics of the results against the qrels
\n \n", "signature": "(self , res : Union [ str , List [ str ]] , with_trec_binary = False , ** kwargs ): ", "funcdef": "def"}, {"fullname": "debeir.models", "modulename": "debeir.models", "kind": "module", "doc": "
\n"}, {"fullname": "debeir.models.colbert", "modulename": "debeir.models.colbert", "kind": "module", "doc": "
\n"}, {"fullname": "debeir.models.colbert.CoLBERTConfig", "modulename": "debeir.models.colbert", "qualname": "CoLBERTConfig", "kind": "class", "doc": "
\n"}, {"fullname": "debeir.models.colbert.CoLBERTConfig.__init__", "modulename": "debeir.models.colbert", "qualname": "CoLBERTConfig.__init__", "kind": "function", "doc": "
\n", "signature": "(** kwargs ) "}, {"fullname": "debeir.models.colbert.CoLBERTConfig.save", "modulename": "debeir.models.colbert", "qualname": "CoLBERTConfig.save", "kind": "function", "doc": "Parameters \n\n\nfname : file name \npath : Path to save \n \n", "signature": "(self , path , fname = 'colbert_config.json' ): ", "funcdef": "def"}, {"fullname": "debeir.models.colbert.CoLBERTConfig.load", "modulename": "debeir.models.colbert", "qualname": "CoLBERTConfig.load", "kind": "function", "doc": "Load the ColBERT config from path (don't point to file name just directory)
\n\nReturns \n", "signature": "(cls , path , fname = 'colbert_config.json' ): ", "funcdef": "def"}, {"fullname": "debeir.models.colbert.ConvolutionalBlock", "modulename": "debeir.models.colbert", "qualname": "ConvolutionalBlock", "kind": "class", "doc": "Base class for all neural network modules.
\n\nYour models should also subclass this class.
\n\nModules can also contain other Modules, allowing to nest them in\na tree structure. You can assign the submodules as regular attributes::
\n\nimport torch.nn as nn\nimport torch.nn.functional as F\n\nclass Model(nn.Module):\n def __init__(self):\n super().__init__()\n self.conv1 = nn.Conv2d(1, 20, 5)\n self.conv2 = nn.Conv2d(20, 20, 5)\n\n def forward(self, x):\n x = F.relu(self.conv1(x))\n return F.relu(self.conv2(x))\n
\n\nSubmodules assigned in this way will be registered, and will have their\nparameters converted too when you call to()
, etc.
\n\n\n\n
As per the example above, an __init__()
call to the parent class\nmust be made before assignment on the child.
\n\n
\n\n:ivar training: Boolean represents whether this module is in training or\n evaluation mode.\n:vartype training: bool
\n", "bases": "torch.nn.modules.module.Module"}, {"fullname": "debeir.models.colbert.ConvolutionalBlock.__init__", "modulename": "debeir.models.colbert", "qualname": "ConvolutionalBlock.__init__", "kind": "function", "doc": "Initializes internal Module state, shared by both nn.Module and ScriptModule.
\n", "signature": "(\tin_channels , \tout_channels , \tkernel_size = 1 , \tfirst_stride = 1 , \tact_func=<class 'torch.nn.modules.activation.ReLU'> ) "}, {"fullname": "debeir.models.colbert.ConvolutionalBlock.forward", "modulename": "debeir.models.colbert", "qualname": "ConvolutionalBlock.forward", "kind": "function", "doc": "Defines the computation performed at every call.
\n\nShould be overridden by all subclasses.
\n\n\n\n
Although the recipe for forward pass needs to be defined within\nthis function, one should call the Module
instance afterwards\ninstead of this since the former takes care of running the\nregistered hooks while the latter silently ignores them.
\n\n
\n", "signature": "(self , x ): ", "funcdef": "def"}, {"fullname": "debeir.models.colbert.KMaxPool", "modulename": "debeir.models.colbert", "qualname": "KMaxPool", "kind": "class", "doc": "Base class for all neural network modules.
\n\nYour models should also subclass this class.
\n\nModules can also contain other Modules, allowing to nest them in\na tree structure. You can assign the submodules as regular attributes::
\n\nimport torch.nn as nn\nimport torch.nn.functional as F\n\nclass Model(nn.Module):\n def __init__(self):\n super().__init__()\n self.conv1 = nn.Conv2d(1, 20, 5)\n self.conv2 = nn.Conv2d(20, 20, 5)\n\n def forward(self, x):\n x = F.relu(self.conv1(x))\n return F.relu(self.conv2(x))\n
\n\nSubmodules assigned in this way will be registered, and will have their\nparameters converted too when you call to()
, etc.
\n\n\n\n
As per the example above, an __init__()
call to the parent class\nmust be made before assignment on the child.
\n\n
\n\n:ivar training: Boolean represents whether this module is in training or\n evaluation mode.\n:vartype training: bool
\n", "bases": "torch.nn.modules.module.Module"}, {"fullname": "debeir.models.colbert.KMaxPool.__init__", "modulename": "debeir.models.colbert", "qualname": "KMaxPool.__init__", "kind": "function", "doc": "Initializes internal Module state, shared by both nn.Module and ScriptModule.
\n", "signature": "(k = 1 ) "}, {"fullname": "debeir.models.colbert.KMaxPool.forward", "modulename": "debeir.models.colbert", "qualname": "KMaxPool.forward", "kind": "function", "doc": "Defines the computation performed at every call.
\n\nShould be overridden by all subclasses.
\n\n\n\n
Although the recipe for forward pass needs to be defined within\nthis function, one should call the Module
instance afterwards\ninstead of this since the former takes care of running the\nregistered hooks while the latter silently ignores them.
\n\n
\n", "signature": "(self , x ): ", "funcdef": "def"}, {"fullname": "debeir.models.colbert.visualisation_dump", "modulename": "debeir.models.colbert", "qualname": "visualisation_dump", "kind": "function", "doc": "
\n", "signature": "(argmax , input_tensors ): ", "funcdef": "def"}, {"fullname": "debeir.models.colbert.ResidualBlock", "modulename": "debeir.models.colbert", "qualname": "ResidualBlock", "kind": "class", "doc": "Base class for all neural network modules.
\n\nYour models should also subclass this class.
\n\nModules can also contain other Modules, allowing to nest them in\na tree structure. You can assign the submodules as regular attributes::
\n\nimport torch.nn as nn\nimport torch.nn.functional as F\n\nclass Model(nn.Module):\n def __init__(self):\n super().__init__()\n self.conv1 = nn.Conv2d(1, 20, 5)\n self.conv2 = nn.Conv2d(20, 20, 5)\n\n def forward(self, x):\n x = F.relu(self.conv1(x))\n return F.relu(self.conv2(x))\n
\n\nSubmodules assigned in this way will be registered, and will have their\nparameters converted too when you call to()
, etc.
\n\n\n\n
As per the example above, an __init__()
call to the parent class\nmust be made before assignment on the child.
\n\n
\n\n:ivar training: Boolean represents whether this module is in training or\n evaluation mode.\n:vartype training: bool
\n", "bases": "torch.nn.modules.module.Module"}, {"fullname": "debeir.models.colbert.ResidualBlock.__init__", "modulename": "debeir.models.colbert", "qualname": "ResidualBlock.__init__", "kind": "function", "doc": "Initializes internal Module state, shared by both nn.Module and ScriptModule.
\n", "signature": "(\tin_channels , \tout_channels , \toptional_shortcut = True , \tkernel_size = 1 , \tact_func=<class 'torch.nn.modules.activation.ReLU'> ) "}, {"fullname": "debeir.models.colbert.ResidualBlock.forward", "modulename": "debeir.models.colbert", "qualname": "ResidualBlock.forward", "kind": "function", "doc": "Defines the computation performed at every call.
\n\nShould be overridden by all subclasses.
\n\n\n\n
Although the recipe for forward pass needs to be defined within\nthis function, one should call the Module
instance afterwards\ninstead of this since the former takes care of running the\nregistered hooks while the latter silently ignores them.
\n\n
\n", "signature": "(self , x ): ", "funcdef": "def"}, {"fullname": "debeir.models.colbert.ColBERT", "modulename": "debeir.models.colbert", "qualname": "ColBERT", "kind": "class", "doc": "Base class for all neural network modules.
\n\nYour models should also subclass this class.
\n\nModules can also contain other Modules, allowing to nest them in\na tree structure. You can assign the submodules as regular attributes::
\n\nimport torch.nn as nn\nimport torch.nn.functional as F\n\nclass Model(nn.Module):\n def __init__(self):\n super().__init__()\n self.conv1 = nn.Conv2d(1, 20, 5)\n self.conv2 = nn.Conv2d(20, 20, 5)\n\n def forward(self, x):\n x = F.relu(self.conv1(x))\n return F.relu(self.conv2(x))\n
\n\nSubmodules assigned in this way will be registered, and will have their\nparameters converted too when you call to()
, etc.
\n\n\n\n
As per the example above, an __init__()
call to the parent class\nmust be made before assignment on the child.
\n\n
\n\n:ivar training: Boolean represents whether this module is in training or\n evaluation mode.\n:vartype training: bool
\n", "bases": "torch.nn.modules.module.Module"}, {"fullname": "debeir.models.colbert.ColBERT.__init__", "modulename": "debeir.models.colbert", "qualname": "ColBERT.__init__", "kind": "function", "doc": "Initializes internal Module state, shared by both nn.Module and ScriptModule.
\n", "signature": "(\tbert_model_args , \tbert_model_kwargs , \tconfig : transformers . models . bert . configuration_bert . BertConfig , \tdevice : str , \tmax_seq_len : int = 128 , \tk : int = 8 , \toptional_shortcut : bool = True , \thidden_neurons : int = 2048 , \tuse_batch_norms : bool = True , \tuse_trans_blocks : bool = False , \tresidual_kernel_size : int = 1 , \tdropout_perc : float = 0.5 , \tact_func = 'mish' , \tloss_func = 'cross_entropy_loss' , \t** kwargs ) "}, {"fullname": "debeir.models.colbert.ColBERT.forward", "modulename": "debeir.models.colbert", "qualname": "ColBERT.forward", "kind": "function", "doc": "Defines the computation performed at every call.
\n\nShould be overridden by all subclasses.
\n\n\n\n
Although the recipe for forward pass needs to be defined within\nthis function, one should call the Module
instance afterwards\ninstead of this since the former takes care of running the\nregistered hooks while the latter silently ignores them.
\n\n
\n", "signature": "(self , * args , ** kwargs ): ", "funcdef": "def"}, {"fullname": "debeir.models.colbert.ColBERT.from_config", "modulename": "debeir.models.colbert", "qualname": "ColBERT.from_config", "kind": "function", "doc": "
\n", "signature": "(cls , * args , config_path ): ", "funcdef": "def"}, {"fullname": "debeir.models.colbert.ColBERT.from_pretrained", "modulename": "debeir.models.colbert", "qualname": "ColBERT.from_pretrained", "kind": "function", "doc": "
\n", "signature": "(cls , output_dir , ** kwargs ): ", "funcdef": "def"}, {"fullname": "debeir.models.colbert.ColBERT.save_pretrained", "modulename": "debeir.models.colbert", "qualname": "ColBERT.save_pretrained", "kind": "function", "doc": "
\n", "signature": "(self , output_dir ): ", "funcdef": "def"}, {"fullname": "debeir.models.colbert.ComBERT", "modulename": "debeir.models.colbert", "qualname": "ComBERT", "kind": "class", "doc": "Base class for all neural network modules.
\n\nYour models should also subclass this class.
\n\nModules can also contain other Modules, allowing to nest them in\na tree structure. You can assign the submodules as regular attributes::
\n\nimport torch.nn as nn\nimport torch.nn.functional as F\n\nclass Model(nn.Module):\n def __init__(self):\n super().__init__()\n self.conv1 = nn.Conv2d(1, 20, 5)\n self.conv2 = nn.Conv2d(20, 20, 5)\n\n def forward(self, x):\n x = F.relu(self.conv1(x))\n return F.relu(self.conv2(x))\n
\n\nSubmodules assigned in this way will be registered, and will have their\nparameters converted too when you call to()
, etc.
\n\n\n\n
As per the example above, an __init__()
call to the parent class\nmust be made before assignment on the child.
\n\n
\n\n:ivar training: Boolean represents whether this module is in training or\n evaluation mode.\n:vartype training: bool
\n", "bases": "torch.nn.modules.module.Module"}, {"fullname": "debeir.models.colbert.ComBERT.__init__", "modulename": "debeir.models.colbert", "qualname": "ComBERT.__init__", "kind": "function", "doc": "Initializes internal Module state, shared by both nn.Module and ScriptModule.
\n", "signature": "(\tbert_model_args , \tbert_model_kwargs , \tconfig : transformers . models . bert . configuration_bert . BertConfig , \tdevice : str , \tmax_seq_len : int = 128 , \tk : int = 8 , \toptional_shortcut : bool = True , \thidden_neurons : int = 2048 , \tuse_batch_norms : bool = True , \tuse_trans_blocks : bool = False , \tresidual_kernel_size : int = 1 , \tdropout_perc : float = 0.5 , \tact_func = 'mish' , \tloss_func = 'cross_entropy_loss' , \tnum_blocks = 2 , \t** kwargs ) "}, {"fullname": "debeir.models.colbert.ComBERT.forward", "modulename": "debeir.models.colbert", "qualname": "ComBERT.forward", "kind": "function", "doc": "Defines the computation performed at every call.
\n\nShould be overridden by all subclasses.
\n\n\n\n
Although the recipe for forward pass needs to be defined within\nthis function, one should call the Module
instance afterwards\ninstead of this since the former takes care of running the\nregistered hooks while the latter silently ignores them.
\n\n
\n", "signature": "(self , * args , ** kwargs ): ", "funcdef": "def"}, {"fullname": "debeir.models.colbert.ComBERT.from_config", "modulename": "debeir.models.colbert", "qualname": "ComBERT.from_config", "kind": "function", "doc": "
\n", "signature": "(cls , * args , config_path ): ", "funcdef": "def"}, {"fullname": "debeir.models.colbert.ComBERT.from_pretrained", "modulename": "debeir.models.colbert", "qualname": "ComBERT.from_pretrained", "kind": "function", "doc": "
\n", "signature": "(cls , output_dir , ** kwargs ): ", "funcdef": "def"}, {"fullname": "debeir.models.colbert.ComBERT.save_pretrained", "modulename": "debeir.models.colbert", "qualname": "ComBERT.save_pretrained", "kind": "function", "doc": "
\n", "signature": "(self , output_dir ): ", "funcdef": "def"}, {"fullname": "debeir.rankers", "modulename": "debeir.rankers", "kind": "module", "doc": "Rankers module.
\n\n\n Includes runnable out-of-box training code\n Custom ranking loss functions (e.g. LambdaLoss, NDCGLoss)\n Includes custom rankers for reranking or NIR-style queries.
\n \n"}, {"fullname": "debeir.rankers.reranking", "modulename": "debeir.rankers.reranking", "kind": "module", "doc": "
\n"}, {"fullname": "debeir.rankers.reranking.nir", "modulename": "debeir.rankers.reranking.nir", "kind": "module", "doc": "NIR Reranker
\n\n[Insert paper link here]
\n"}, {"fullname": "debeir.rankers.reranking.nir.NIReRanker", "modulename": "debeir.rankers.reranking.nir", "qualname": "NIReRanker", "kind": "class", "doc": "Re-ranker which uses the NIR scoring method\n score = log(bm25)/log(z) + cosine_sum
\n", "bases": "debeir.rankers.reranking.reranker.DocumentReRanker"}, {"fullname": "debeir.rankers.reranking.nir.NIReRanker.__init__", "modulename": "debeir.rankers.reranking.nir", "qualname": "NIReRanker.__init__", "kind": "function", "doc": "
\n", "signature": "(\tquery , \tranked_list : List [ debeir . core . document . Document ] , \tencoder : debeir . rankers . transformer_sent_encoder . Encoder , \tdistance_fn =< function cosine > , \tfacets_weights : Dict = None , \tpresort = False , \tfields_to_encode = None , \t* args , \t** kwargs ) "}, {"fullname": "debeir.rankers.reranking.reranker", "modulename": "debeir.rankers.reranking.reranker", "kind": "module", "doc": "General re-ranking interfaces to be implemented by child classes.
\n"}, {"fullname": "debeir.rankers.reranking.reranker.ReRanker", "modulename": "debeir.rankers.reranking.reranker", "qualname": "ReRanker", "kind": "class", "doc": "General interface for a reranking.
\n\nChild classes should implement the abstract methods.
\n"}, {"fullname": "debeir.rankers.reranking.reranker.ReRanker.__init__", "modulename": "debeir.rankers.reranking.reranker", "qualname": "ReRanker.__init__", "kind": "function", "doc": "
\n", "signature": "(query , ranked_list : List , * args , ** kwargs ) "}, {"fullname": "debeir.rankers.reranking.reranker.ReRanker.rerank", "modulename": "debeir.rankers.reranking.reranker", "qualname": "ReRanker.rerank", "kind": "function", "doc": "Re-rank the passed ranked list based on implemented private _compute_scores method.
\n\nParameters \n\n\n\nReturns \n\n\nA ranked list in descending order of the score field (which will be the last item in the list)\n
\n \n", "signature": "(self ) -> List : ", "funcdef": "def"}, {"fullname": "debeir.rankers.reranking.reranker.DocumentReRanker", "modulename": "debeir.rankers.reranking.reranker", "qualname": "DocumentReRanker", "kind": "class", "doc": "Reranking interface for a ranked list of Document objects.
\n", "bases": "ReRanker"}, {"fullname": "debeir.rankers.reranking.reranker.DocumentReRanker.__init__", "modulename": "debeir.rankers.reranking.reranker", "qualname": "DocumentReRanker.__init__", "kind": "function", "doc": "
\n", "signature": "(\tquery , \tranked_list : List [ debeir . core . document . Document ] , \t* args , \t** kwargs ) "}, {"fullname": "debeir.rankers.reranking.reranker.ReRankerPool", "modulename": "debeir.rankers.reranking.reranker", "qualname": "ReRankerPool", "kind": "class", "doc": "
\n"}, {"fullname": "debeir.rankers.reranking.reranker.ReRankerPool.__init__", "modulename": "debeir.rankers.reranking.reranker", "qualname": "ReRankerPool.__init__", "kind": "function", "doc": "
\n", "signature": "() "}, {"fullname": "debeir.rankers.reranking.use", "modulename": "debeir.rankers.reranking.use", "kind": "module", "doc": "
\n"}, {"fullname": "debeir.rankers.reranking.use.USEReRanker", "modulename": "debeir.rankers.reranking.use", "qualname": "USEReRanker", "kind": "class", "doc": "Re-ranks based on the cosine_sum rather the complete NIR scoring
\n", "bases": "debeir.rankers.reranking.nir.NIReRanker"}, {"fullname": "debeir.rankers.reranking.use.USEReRanker.__init__", "modulename": "debeir.rankers.reranking.use", "qualname": "USEReRanker.__init__", "kind": "function", "doc": "
\n", "signature": "(* args , ** kwargs ) "}, {"fullname": "debeir.rankers.transformer_sent_encoder", "modulename": "debeir.rankers.transformer_sent_encoder", "kind": "module", "doc": "
\n"}, {"fullname": "debeir.rankers.transformer_sent_encoder.Encoder", "modulename": "debeir.rankers.transformer_sent_encoder", "qualname": "Encoder", "kind": "class", "doc": "A wrapper for the Sentence Transformer Encoder used in Universal Sentence Embeddings (USE) for ranking or reranking.
\n\nParameters \n\n\nmodel_path : The path to a sentence transformer or transformer model. \nnormalize : Normalize the output vectors to unit length for dot product retrieval rather than cosine. \nspacy_model : the spacy or scispacy model to use for sentence boundary detection. \nmax_length : Maximum input length for the spacy nlp model. \n \n"}, {"fullname": "debeir.rankers.transformer_sent_encoder.Encoder.__init__", "modulename": "debeir.rankers.transformer_sent_encoder", "qualname": "Encoder.__init__", "kind": "function", "doc": "
\n", "signature": "(\tmodel_path , \tnormalize = False , \tspacy_model = 'en_core_sci_md' , \tmax_length = 2000000 ) "}, {"fullname": "debeir.rankers.transformer_sent_encoder.Encoder.encode", "modulename": "debeir.rankers.transformer_sent_encoder", "qualname": "Encoder.encode", "kind": "function", "doc": "Computes sentence embeddings for a given topic, uses spacy for sentence segmentation.\nBy default, uses a cache to store previously computed vectors. Pass \"disable_cache\" as a kwarg to disable this.
\n\nParameters \n\n\ntopic : The topic (a list of sentences) to encode. Should be a raw string. \ndisable_cache : keyword argument, pass as True to disable encoding caching. \n \n\nReturns \n\n\nReturns a list of encoded tensors is returned.\n
\n \n", "signature": "(self , topic : str ) -> List : ", "funcdef": "def"}, {"fullname": "debeir.training", "modulename": "debeir.training", "kind": "module", "doc": "Runnable out-of-the-box code for training re-rankers.
\n"}, {"fullname": "debeir.training.evaluate_reranker", "modulename": "debeir.training.evaluate_reranker", "kind": "module", "doc": "
\n"}, {"fullname": "debeir.training.evaluate_reranker.SentenceEvaluator", "modulename": "debeir.training.evaluate_reranker", "qualname": "SentenceEvaluator", "kind": "class", "doc": "Evaluation class for computing metrics from TREC-style files
\n", "bases": "debeir.evaluation.evaluator.Evaluator"}, {"fullname": "debeir.training.evaluate_reranker.SentenceEvaluator.__init__", "modulename": "debeir.training.evaluate_reranker", "qualname": "SentenceEvaluator.__init__", "kind": "function", "doc": "
\n", "signature": "(\tmodel : debeir . rankers . transformer_sent_encoder . Encoder , \tdataset : datasets . arrow_dataset . Dataset , \tparsed_topics : Dict [ Union [ str , int ], Dict ] , \ttext_cols : List [ str ] , \tquery_cols : List [ str ] , \tid_col : str , \tdistance_fn : str , \tqrels : str , \tmetrics : List [ str ] ) "}, {"fullname": "debeir.training.evaluate_reranker.SentenceEvaluator.produce_ranked_lists", "modulename": "debeir.training.evaluate_reranker", "qualname": "SentenceEvaluator.produce_ranked_lists", "kind": "function", "doc": "
\n", "signature": "(self ): ", "funcdef": "def"}, {"fullname": "debeir.training.hparm_tuning", "modulename": "debeir.training.hparm_tuning", "kind": "module", "doc": "Hyper parameter tuning library using Optuna and Wandb
\n"}, {"fullname": "debeir.training.hparm_tuning.config", "modulename": "debeir.training.hparm_tuning.config", "kind": "module", "doc": "
\n"}, {"fullname": "debeir.training.hparm_tuning.config.HparamConfig", "modulename": "debeir.training.hparm_tuning.config", "qualname": "HparamConfig", "kind": "class", "doc": "Hyperparameter configuration file
\n\nExpects a dictionary of hyperparameters
\n\nhparams: Dict\n{\n \"learning_rate\": {\n \"type\": float\n \"low\": 0.1\n \"high\": 1.0\n \"step\": 0.1\n # OR\n args: [0.1, 1.0, 0.1]\n },\n}
\n", "bases": "debeir.core.config.Config"}, {"fullname": "debeir.training.hparm_tuning.config.HparamConfig.__init__", "modulename": "debeir.training.hparm_tuning.config", "qualname": "HparamConfig.__init__", "kind": "function", "doc": "
\n", "signature": "(hparams : Dict [ str , Dict ] ) "}, {"fullname": "debeir.training.hparm_tuning.config.HparamConfig.from_json", "modulename": "debeir.training.hparm_tuning.config", "qualname": "HparamConfig.from_json", "kind": "function", "doc": "
\n", "signature": "(cls , fp ) -> debeir . training . hparm_tuning . config . HparamConfig : ", "funcdef": "def"}, {"fullname": "debeir.training.hparm_tuning.config.HparamConfig.validate", "modulename": "debeir.training.hparm_tuning.config", "qualname": "HparamConfig.validate", "kind": "function", "doc": "Validates if the config is correct.\nMust be implemented by inherited classes.
\n", "signature": "(self ): ", "funcdef": "def"}, {"fullname": "debeir.training.hparm_tuning.config.HparamConfig.parse_config_to_py", "modulename": "debeir.training.hparm_tuning.config", "qualname": "HparamConfig.parse_config_to_py", "kind": "function", "doc": "Parses configuration file into usable python objects
\n", "signature": "(self ): ", "funcdef": "def"}, {"fullname": "debeir.training.hparm_tuning.optuna_rank", "modulename": "debeir.training.hparm_tuning.optuna_rank", "kind": "module", "doc": "
\n"}, {"fullname": "debeir.training.hparm_tuning.optuna_rank.objective", "modulename": "debeir.training.hparm_tuning.optuna_rank", "qualname": "objective", "kind": "function", "doc": "
\n", "signature": "(\ttrainer : debeir . training . hparm_tuning . trainer . Trainer , \ttrial : optuna . trial . _trial . Trial ): ", "funcdef": "def"}, {"fullname": "debeir.training.hparm_tuning.optuna_rank.run_optuna_with_wandb", "modulename": "debeir.training.hparm_tuning.optuna_rank", "qualname": "run_optuna_with_wandb", "kind": "function", "doc": "Partially initialize the objective function with a trainer and hparams to optimize.
\n\nOptimize using the optuna library.
\n\nParameters \n\n\ntrainer : \nn_trials : \nmaximize_objective : \nwandb_kwargs : \n \n\nReturns \n", "signature": "(\ttrainer , \tn_trials = 100 , \tn_jobs = 1 , \tmaximize_objective = True , \tsave_study_path = '.' , \twandb_kwargs = None ): ", "funcdef": "def"}, {"fullname": "debeir.training.hparm_tuning.optuna_rank.print_optuna_stats", "modulename": "debeir.training.hparm_tuning.optuna_rank", "qualname": "print_optuna_stats", "kind": "function", "doc": "
\n", "signature": "(study : optuna . study . study . Study ): ", "funcdef": "def"}, {"fullname": "debeir.training.hparm_tuning.trainer", "modulename": "debeir.training.hparm_tuning.trainer", "kind": "module", "doc": "
\n"}, {"fullname": "debeir.training.hparm_tuning.trainer.OptimizersWrapper", "modulename": "debeir.training.hparm_tuning.trainer", "qualname": "OptimizersWrapper", "kind": "class", "doc": "
\n"}, {"fullname": "debeir.training.hparm_tuning.trainer.OptimizersWrapper.__init__", "modulename": "debeir.training.hparm_tuning.trainer", "qualname": "OptimizersWrapper.__init__", "kind": "function", "doc": "
\n", "signature": "() "}, {"fullname": "debeir.training.hparm_tuning.trainer.Trainer", "modulename": "debeir.training.hparm_tuning.trainer", "qualname": "Trainer", "kind": "class", "doc": "Wrapper class for a trainer class.
\n"}, {"fullname": "debeir.training.hparm_tuning.trainer.Trainer.__init__", "modulename": "debeir.training.hparm_tuning.trainer", "qualname": "Trainer.__init__", "kind": "function", "doc": "
\n", "signature": "(model , evaluator_fn , dataset_loading_fn ) "}, {"fullname": "debeir.training.hparm_tuning.trainer.Trainer.fit", "modulename": "debeir.training.hparm_tuning.trainer", "qualname": "Trainer.fit", "kind": "function", "doc": "
\n", "signature": "(\tself , \tin_trial : optuna . trial . _trial . Trial , \ttrain_dataset , \tval_dataset ): ", "funcdef": "def"}, {"fullname": "debeir.training.hparm_tuning.trainer.SentenceTransformerHparamTrainer", "modulename": "debeir.training.hparm_tuning.trainer", "qualname": "SentenceTransformerHparamTrainer", "kind": "class", "doc": "See Optuna documentation for types!
\n", "bases": "Trainer"}, {"fullname": "debeir.training.hparm_tuning.trainer.SentenceTransformerHparamTrainer.__init__", "modulename": "debeir.training.hparm_tuning.trainer", "qualname": "SentenceTransformerHparamTrainer.__init__", "kind": "function", "doc": "
\n", "signature": "(\tdataset_loading_fn , \tevaluator_fn , \thparams_config : debeir . training . hparm_tuning . config . HparamConfig ) "}, {"fullname": "debeir.training.hparm_tuning.trainer.SentenceTransformerHparamTrainer.get_optuna_hparams", "modulename": "debeir.training.hparm_tuning.trainer", "qualname": "SentenceTransformerHparamTrainer.get_optuna_hparams", "kind": "function", "doc": "Get hyperparameters suggested by the optuna library
\n\nParameters \n\n\ntrial : The optuna trial object \nhparams : Optional, pass a dictionary of HparamType[Enum] objects \n \n\nReturns \n", "signature": "(\tself , \ttrial : optuna . trial . _trial . Trial , \thparams : Sequence [ debeir . training . hparm_tuning . types . Hparam ] = None ): ", "funcdef": "def"}, {"fullname": "debeir.training.hparm_tuning.trainer.SentenceTransformerHparamTrainer.build_kwargs_and_model", "modulename": "debeir.training.hparm_tuning.trainer", "qualname": "SentenceTransformerHparamTrainer.build_kwargs_and_model", "kind": "function", "doc": "
\n", "signature": "(self , hparams : Dict ): ", "funcdef": "def"}, {"fullname": "debeir.training.hparm_tuning.trainer.SentenceTransformerHparamTrainer.fit", "modulename": "debeir.training.hparm_tuning.trainer", "qualname": "SentenceTransformerHparamTrainer.fit", "kind": "function", "doc": "
\n", "signature": "(\tself , \tin_trial : optuna . trial . _trial . Trial , \ttrain_dataset , \tval_dataset ): ", "funcdef": "def"}, {"fullname": "debeir.training.hparm_tuning.trainer.trial_callback", "modulename": "debeir.training.hparm_tuning.trainer", "qualname": "trial_callback", "kind": "function", "doc": "
\n", "signature": "(trial , score , epoch , * args , ** kwargs ): ", "funcdef": "def"}, {"fullname": "debeir.training.hparm_tuning.trainer.SentenceTransformerTrainer", "modulename": "debeir.training.hparm_tuning.trainer", "qualname": "SentenceTransformerTrainer", "kind": "class", "doc": "See Optuna documentation for types!
\n", "bases": "SentenceTransformerHparamTrainer"}, {"fullname": "debeir.training.hparm_tuning.trainer.SentenceTransformerTrainer.__init__", "modulename": "debeir.training.hparm_tuning.trainer", "qualname": "SentenceTransformerTrainer.__init__", "kind": "function", "doc": "
\n", "signature": "(\tdataset : Union [ datasets . dataset_dict . DatasetDict , Dict [ str , datasets . arrow_dataset . Dataset ]] , \thparams_config : debeir . training . hparm_tuning . config . HparamConfig , \tevaluator_fn = None , \tevaluator = None , \tuse_wandb = False ) "}, {"fullname": "debeir.training.hparm_tuning.trainer.SentenceTransformerTrainer.fit", "modulename": "debeir.training.hparm_tuning.trainer", "qualname": "SentenceTransformerTrainer.fit", "kind": "function", "doc": "
\n", "signature": "(self , ** extra_kwargs ): ", "funcdef": "def"}, {"fullname": "debeir.training.hparm_tuning.types", "modulename": "debeir.training.hparm_tuning.types", "kind": "module", "doc": "
\n"}, {"fullname": "debeir.training.hparm_tuning.types.Hparam", "modulename": "debeir.training.hparm_tuning.types", "qualname": "Hparam", "kind": "class", "doc": "
\n"}, {"fullname": "debeir.training.hparm_tuning.types.Hparam.__init__", "modulename": "debeir.training.hparm_tuning.types", "qualname": "Hparam.__init__", "kind": "function", "doc": "
\n", "signature": "() "}, {"fullname": "debeir.training.hparm_tuning.types.Hparam.suggest", "modulename": "debeir.training.hparm_tuning.types", "qualname": "Hparam.suggest", "kind": "function", "doc": "
\n", "signature": "(self , * args , ** kwargs ): ", "funcdef": "def"}, {"fullname": "debeir.training.hparm_tuning.types.HparamFloat", "modulename": "debeir.training.hparm_tuning.types", "qualname": "HparamFloat", "kind": "class", "doc": "
\n", "bases": "Hparam"}, {"fullname": "debeir.training.hparm_tuning.types.HparamFloat.__init__", "modulename": "debeir.training.hparm_tuning.types", "qualname": "HparamFloat.__init__", "kind": "function", "doc": "
\n", "signature": "(\tname : str , \tlow : float , \thigh : float , \tlog : bool = False , \tstep : float = None ) "}, {"fullname": "debeir.training.hparm_tuning.types.HparamFloat.suggest", "modulename": "debeir.training.hparm_tuning.types", "qualname": "HparamFloat.suggest", "kind": "function", "doc": "
\n", "signature": "(self , trial : optuna . trial . _trial . Trial ): ", "funcdef": "def"}, {"fullname": "debeir.training.hparm_tuning.types.HparamInt", "modulename": "debeir.training.hparm_tuning.types", "qualname": "HparamInt", "kind": "class", "doc": "
\n", "bases": "Hparam"}, {"fullname": "debeir.training.hparm_tuning.types.HparamInt.__init__", "modulename": "debeir.training.hparm_tuning.types", "qualname": "HparamInt.__init__", "kind": "function", "doc": "
\n", "signature": "(name : str , low : int , high : int , log : bool = False , step : int = 1 ) "}, {"fullname": "debeir.training.hparm_tuning.types.HparamInt.suggest", "modulename": "debeir.training.hparm_tuning.types", "qualname": "HparamInt.suggest", "kind": "function", "doc": "
\n", "signature": "(self , trial : optuna . trial . _trial . Trial ): ", "funcdef": "def"}, {"fullname": "debeir.training.hparm_tuning.types.HparamCategorical", "modulename": "debeir.training.hparm_tuning.types", "qualname": "HparamCategorical", "kind": "class", "doc": "
\n", "bases": "Hparam"}, {"fullname": "debeir.training.hparm_tuning.types.HparamCategorical.__init__", "modulename": "debeir.training.hparm_tuning.types", "qualname": "HparamCategorical.__init__", "kind": "function", "doc": "
\n", "signature": "(name : str , choices : Sequence , func : str = 'suggest_categorical' ) "}, {"fullname": "debeir.training.hparm_tuning.types.HparamCategorical.suggest", "modulename": "debeir.training.hparm_tuning.types", "qualname": "HparamCategorical.suggest", "kind": "function", "doc": "
\n", "signature": "(self , trial : optuna . trial . _trial . Trial ): ", "funcdef": "def"}, {"fullname": "debeir.training.hparm_tuning.types.HparamUniform", "modulename": "debeir.training.hparm_tuning.types", "qualname": "HparamUniform", "kind": "class", "doc": "
\n", "bases": "Hparam"}, {"fullname": "debeir.training.hparm_tuning.types.HparamUniform.__init__", "modulename": "debeir.training.hparm_tuning.types", "qualname": "HparamUniform.__init__", "kind": "function", "doc": "
\n", "signature": "(name : str , low : float , high : float , func : str = 'suggest_uniform' ) "}, {"fullname": "debeir.training.hparm_tuning.types.HparamUniform.suggest", "modulename": "debeir.training.hparm_tuning.types", "qualname": "HparamUniform.suggest", "kind": "function", "doc": "
\n", "signature": "(self , trial : optuna . trial . _trial . Trial ): ", "funcdef": "def"}, {"fullname": "debeir.training.hparm_tuning.types.HparamLogUniform", "modulename": "debeir.training.hparm_tuning.types", "qualname": "HparamLogUniform", "kind": "class", "doc": "
\n", "bases": "Hparam"}, {"fullname": "debeir.training.hparm_tuning.types.HparamLogUniform.__init__", "modulename": "debeir.training.hparm_tuning.types", "qualname": "HparamLogUniform.__init__", "kind": "function", "doc": "
\n", "signature": "(name : str , low : float , high : float , func : str = 'suggest_loguniform' ) "}, {"fullname": "debeir.training.hparm_tuning.types.HparamLogUniform.suggest", "modulename": "debeir.training.hparm_tuning.types", "qualname": "HparamLogUniform.suggest", "kind": "function", "doc": "
\n", "signature": "(self , trial : optuna . trial . _trial . Trial ): ", "funcdef": "def"}, {"fullname": "debeir.training.hparm_tuning.types.HparamDiscreteUniform", "modulename": "debeir.training.hparm_tuning.types", "qualname": "HparamDiscreteUniform", "kind": "class", "doc": "
\n", "bases": "Hparam"}, {"fullname": "debeir.training.hparm_tuning.types.HparamDiscreteUniform.__init__", "modulename": "debeir.training.hparm_tuning.types", "qualname": "HparamDiscreteUniform.__init__", "kind": "function", "doc": "
\n", "signature": "(\tname : str , \tlow : float , \thigh : float , \tq : float , \tfunc : str = 'suggest_discrete_uniform' ) "}, {"fullname": "debeir.training.hparm_tuning.types.HparamDiscreteUniform.suggest", "modulename": "debeir.training.hparm_tuning.types", "qualname": "HparamDiscreteUniform.suggest", "kind": "function", "doc": "
\n", "signature": "(self , trial : optuna . trial . _trial . Trial ): ", "funcdef": "def"}, {"fullname": "debeir.training.losses", "modulename": "debeir.training.losses", "kind": "module", "doc": "
\n"}, {"fullname": "debeir.training.losses.contrastive", "modulename": "debeir.training.losses.contrastive", "kind": "module", "doc": "Author: Yonglong Tian (yonglong@mit.edu)\nDate: May 07, 2020
\n\nCode imported from: https://github.com/HobbitLong/SupContrast/blob/master/losses.py
\n"}, {"fullname": "debeir.training.losses.contrastive.SupConLoss", "modulename": "debeir.training.losses.contrastive", "qualname": "SupConLoss", "kind": "class", "doc": "Supervised Contrastive Learning: https://arxiv.org/pdf/2004.11362.pdf .\nIt also supports the unsupervised contrastive loss in SimCLR
\n", "bases": "torch.nn.modules.module.Module"}, {"fullname": "debeir.training.losses.contrastive.SupConLoss.__init__", "modulename": "debeir.training.losses.contrastive", "qualname": "SupConLoss.__init__", "kind": "function", "doc": "Initializes internal Module state, shared by both nn.Module and ScriptModule.
\n", "signature": "(temperature = 1.0 , contrast_mode = 'all' , base_temperature = 1.0 ) "}, {"fullname": "debeir.training.losses.contrastive.SupConLoss.forward", "modulename": "debeir.training.losses.contrastive", "qualname": "SupConLoss.forward", "kind": "function", "doc": "Compute loss for model. If both labels
and mask
are None,\nit degenerates to SimCLR unsupervised loss:\nhttps://arxiv.org/pdf/2002.05709.pdf \nArgs:\n features: hidden vector of shape [bsz, n_views, ...].\n labels: ground truth of shape [bsz].\n mask: contrastive mask of shape [bsz, bsz], mask_{i,j}=1 if sample j\n has the same class as sample i. Can be asymmetric.\nReturns:\n A loss scalar.
\n", "signature": "(self , features , labels = None , mask = None ): ", "funcdef": "def"}, {"fullname": "debeir.training.losses.contrastive.SiameseDistanceMetric", "modulename": "debeir.training.losses.contrastive", "qualname": "SiameseDistanceMetric", "kind": "class", "doc": "The metric for the contrastive loss
\n", "bases": "enum.Enum"}, {"fullname": "debeir.training.losses.contrastive.SiameseDistanceMetric.EUCLIDEAN", "modulename": "debeir.training.losses.contrastive", "qualname": "SiameseDistanceMetric.EUCLIDEAN", "kind": "function", "doc": "
\n", "signature": "(x , y ): ", "funcdef": "def"}, {"fullname": "debeir.training.losses.contrastive.SiameseDistanceMetric.MANHATTAN", "modulename": "debeir.training.losses.contrastive", "qualname": "SiameseDistanceMetric.MANHATTAN", "kind": "function", "doc": "
\n", "signature": "(x , y ): ", "funcdef": "def"}, {"fullname": "debeir.training.losses.contrastive.SiameseDistanceMetric.COSINE_DISTANCE", "modulename": "debeir.training.losses.contrastive", "qualname": "SiameseDistanceMetric.COSINE_DISTANCE", "kind": "function", "doc": "
\n", "signature": "(x , y ): ", "funcdef": "def"}, {"fullname": "debeir.training.losses.contrastive.ContrastiveSentLoss", "modulename": "debeir.training.losses.contrastive", "qualname": "ContrastiveSentLoss", "kind": "class", "doc": "Contrastive loss. Expects as input two texts and a label of either 0 or 1. If the label == 1, then the distance between the\ntwo embeddings is reduced. If the label == 0, then the distance between the embeddings is increased.\nFurther information: http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf
\n\nParameters \n\n\nmodel : SentenceTransformer model \ndistance_metric : Function that returns a distance between two emeddings. The class SiameseDistanceMetric contains pre-defined metrices that can be used \nmargin : Negative samples (label == 0) should have a distance of at least the margin value. \nsize_average : Average by the size of the mini-batch.\nExample::\nfrom sentence_transformers import SentenceTransformer, LoggingHandler, losses, InputExample\nfrom torch.utils.data import DataLoader\nmodel = SentenceTransformer('all-MiniLM-L6-v2')\ntrain_examples = [\n InputExample(texts=['This is a positive pair', 'Where the distance will be minimized'], label=1),\n InputExample(texts=['This is a negative pair', 'Their distance will be increased'], label=0)]\ntrain_dataloader = DataLoader(train_examples, shuffle=True, batch_size=2)\ntrain_loss = losses.ContrastiveLoss(model=model)\nmodel.fit([(train_dataloader, train_loss)], show_progress_bar=True) \n \n", "bases": "torch.nn.modules.module.Module"}, {"fullname": "debeir.training.losses.contrastive.ContrastiveSentLoss.__init__", "modulename": "debeir.training.losses.contrastive", "qualname": "ContrastiveSentLoss.__init__", "kind": "function", "doc": "Initializes internal Module state, shared by both nn.Module and ScriptModule.
\n", "signature": "(\tmodel , \tdistance_metric =< function SiameseDistanceMetric .< lambda >> , \tmargin : float = 0.5 , \tsize_average : bool = True ) "}, {"fullname": "debeir.training.losses.contrastive.ContrastiveSentLoss.get_config_dict", "modulename": "debeir.training.losses.contrastive", "qualname": "ContrastiveSentLoss.get_config_dict", "kind": "function", "doc": "
\n", "signature": "(self ): ", "funcdef": "def"}, {"fullname": "debeir.training.losses.contrastive.ContrastiveSentLoss.forward", "modulename": "debeir.training.losses.contrastive", "qualname": "ContrastiveSentLoss.forward", "kind": "function", "doc": "Defines the computation performed at every call.
\n\nShould be overridden by all subclasses.
\n\n\n\n
Although the recipe for forward pass needs to be defined within\nthis function, one should call the Module
instance afterwards\ninstead of this since the former takes care of running the\nregistered hooks while the latter silently ignores them.
\n\n
\n", "signature": "(\tself , \tsentence_features : Iterable [ Dict [ str , torch . Tensor ]] , \tlabels : torch . Tensor ): ", "funcdef": "def"}, {"fullname": "debeir.training.losses.ranking", "modulename": "debeir.training.losses.ranking", "kind": "module", "doc": "Losses are drawn from the allrank library
\n"}, {"fullname": "debeir.training.train_reranker", "modulename": "debeir.training.train_reranker", "kind": "module", "doc": "
\n"}, {"fullname": "debeir.training.train_reranker.train_cross_encoder_reranker", "modulename": "debeir.training.train_reranker", "qualname": "train_cross_encoder_reranker", "kind": "function", "doc": "Trains a reranker with relevance signals
\n\nParameters \n\n\nmodel_fp_or_name : The model name or path to the model \noutput_dir : Output directory to save model, logs etc. \ntrain_dataset : Training Examples \ndev_dataset : Dev examples \ntrain_batch_size : Training batch size \nnum_epochs : Number of epochs \nwarmup_steps : Warmup steps for the scheduler \nevaluate_every_n_step : Evaluate the model every n steps \nspecial_tokens : Special tokens to add, defaults to [DOC], [QRY] tokens (bi-encoder) \npooling_mode : Pooling mode for a sentence transformer model \nloss_func : Loss function(s) to use \nevaluator : Evaluator to use \n \n", "signature": "(\tmodel_fp_or_name : str , \toutput_dir : str , \ttrain_dataset : List [ debeir . datasets . types . RelevanceExample ] , \tdev_dataset : List [ debeir . datasets . types . RelevanceExample ] , \ttrain_batch_size = 32 , \tnum_epochs = 3 , \twarmup_steps = None , \tevaluate_every_n_step : int = 1000 , \tspecial_tokens = None , \tpooling_mode = None , \tloss_func = None , \tevaluator : sentence_transformers . evaluation . SentenceEvaluator . SentenceEvaluator = None , \t* args , \t** kwargs ): ", "funcdef": "def"}, {"fullname": "debeir.training.train_sentence_encoder", "modulename": "debeir.training.train_sentence_encoder", "kind": "module", "doc": "
\n"}, {"fullname": "debeir.training.train_sentence_encoder.train_biencoder", "modulename": "debeir.training.train_sentence_encoder", "qualname": "train_biencoder", "kind": "function", "doc": "Train a universal sentence encoder
\n\nParameters \n\n\nmodel_fp_or_name : The model name or path to the model \noutput_dir : Output directory to save model, logs etc. \ntrain_examples : Training Examples \ndev_examples : Dev examples \ntrain_batch_size : Training batch size \nnum_epochs : Number of epochs \nwarmup_steps : Warmup steps for the scheduler \nevaluate_every_n_step : Evaluate the model every n steps \nspecial_tokens : Special tokens to add \npooling_mode : Pooling mode for a sentence transformer model \nloss_func : Loss function(s) to use \nevaluator : Evaluator to use \n \n", "signature": "(\tmodel_fp_or_name : str , \toutput_dir : str , \ttrain_examples : List [ debeir . datasets . types . InputExample ] , \tdev_examples : List [ debeir . datasets . types . InputExample ] , \ttrain_batch_size = 32 , \tnum_epochs = 3 , \twarmup_steps = None , \tevaluate_every_n_step : int = 1000 , \tspecial_tokens = None , \tpooling_mode = None , \tloss_func = None , \tevaluator : sentence_transformers . evaluation . SentenceEvaluator . SentenceEvaluator = None , \t* args , \t** kwargs ): ", "funcdef": "def"}, {"fullname": "debeir.training.train_sentence_encoder.train_huggingface_transformer", "modulename": "debeir.training.train_sentence_encoder", "qualname": "train_huggingface_transformer", "kind": "function", "doc": "Train a transformer model using the Huggingface API
\n\nParameters \n\n\nmodel_fp_or_name_or_cls : Model name or model class to instantiate \ntokenizer : Tokenizer \noutput_dir : Output directory to write to \ncompute_metric_fn : Metric function to compute metrics \nmetric : Metric used by the compute_metric_fn \ndataset : Huggingface Dataset Dict \ntrain_dataset : Training dataset to be used by the Trainer class \neval_dataset : Evaluation dataset to be used by the Trainer class \ntrain_batch_size : Batch size to use for training \nnum_epochs: Number of training epochs (default : 3) \nlearning_rate: Learning rate (default : 5e-5) \nlr_scheduler_type : Learning rate type, see SchedulerType \noptimizer : Optimizer \nwarmup_ratio : Warmup ratios as ratio of steps (default 0.1) \nevaluate_every_n_step : Number of steps to evaluate \npooling_mode : Pooling mode for your model \nloss_func : Loss function to instantiate model \nmodel_args : Model arguments to pass \nmodel_kwargs : Model keyword arguments \npadding_strategy : Tokenization padding strategy \ntruncate : Truncate tokenization strategy \nspecial_tokens : Special tokens to add to the tokenizer \nseed : Dataset shuffle seed \nargs : \nkwargs : \n \n\nReturns \n", "signature": "(\tmodel_fp_or_name_or_cls : Union [ str , transformers . modeling_utils . PreTrainedModel ] , \ttokenizer : transformers . tokenization_utils . PreTrainedTokenizer , \toutput_dir : str , \tcompute_metric_fn , \tmetric : datasets . metric . Metric , \tdataset : datasets . dataset_dict . DatasetDict = None , \ttrain_dataset : List [ Union [ debeir . datasets . types . RelevanceExample , debeir . datasets . types . InputExample , datasets . arrow_dataset . Dataset ]] = None , \teval_dataset : List [ Union [ debeir . datasets . types . RelevanceExample , debeir . datasets . types . InputExample , datasets . arrow_dataset . Dataset ]] = None , \ttrain_batch_size = 32 , \tnum_epochs = 3 , \tlearning_rate = 5e-05 , \tlr_scheduler_type : transformers . trainer_utils . 
SchedulerType = < SchedulerType . CONSTANT_WITH_WARMUP : 'constant_with_warmup' > , \toptimizer : str = 'adamw_hf' , \twarmup_ratio = 0.1 , \tevaluate_every_n_step : int = 1000 , \tpooling_mode = None , \tloss_func = None , \tmodel_args = None , \tmodel_kwargs = None , \tpadding_strategy = 'max_length' , \ttruncate = True , \tspecial_tokens = None , \tseed = 42 , \t* args , \t** kwargs ) -> transformers . trainer . Trainer : ", "funcdef": "def"}, {"fullname": "debeir.training.utils", "modulename": "debeir.training.utils", "kind": "module", "doc": "
\n"}, {"fullname": "debeir.training.utils.LoggingScheduler", "modulename": "debeir.training.utils", "qualname": "LoggingScheduler", "kind": "class", "doc": "
\n"}, {"fullname": "debeir.training.utils.LoggingScheduler.__init__", "modulename": "debeir.training.utils", "qualname": "LoggingScheduler.__init__", "kind": "function", "doc": "
\n", "signature": "(scheduler : torch . optim . lr_scheduler . LambdaLR ) "}, {"fullname": "debeir.training.utils.LoggingScheduler.step", "modulename": "debeir.training.utils", "qualname": "LoggingScheduler.step", "kind": "function", "doc": "
\n", "signature": "(self , epoch = None ): ", "funcdef": "def"}, {"fullname": "debeir.training.utils.get_scheduler_with_wandb", "modulename": "debeir.training.utils", "qualname": "get_scheduler_with_wandb", "kind": "function", "doc": "Returns the correct learning rate scheduler. Available scheduler: constantlr, warmupconstant, warmuplinear, warmupcosine, warmupcosinewithhardrestarts
\n", "signature": "(optimizer , scheduler : str , warmup_steps : int , t_total : int ): ", "funcdef": "def"}, {"fullname": "debeir.training.utils.LoggingLoss", "modulename": "debeir.training.utils", "qualname": "LoggingLoss", "kind": "class", "doc": "
\n"}, {"fullname": "debeir.training.utils.LoggingLoss.__init__", "modulename": "debeir.training.utils", "qualname": "LoggingLoss.__init__", "kind": "function", "doc": "
\n", "signature": "(loss_fn ) "}, {"fullname": "debeir.training.utils.TokenizerOverload", "modulename": "debeir.training.utils", "qualname": "TokenizerOverload", "kind": "class", "doc": "
\n"}, {"fullname": "debeir.training.utils.TokenizerOverload.__init__", "modulename": "debeir.training.utils", "qualname": "TokenizerOverload.__init__", "kind": "function", "doc": "
\n", "signature": "(tokenizer , tokenizer_kwargs , debug = False ) "}, {"fullname": "debeir.training.utils.LoggingEvaluator", "modulename": "debeir.training.utils", "qualname": "LoggingEvaluator", "kind": "class", "doc": "
\n"}, {"fullname": "debeir.training.utils.LoggingEvaluator.__init__", "modulename": "debeir.training.utils", "qualname": "LoggingEvaluator.__init__", "kind": "function", "doc": "
\n", "signature": "(evaluator ) "}, {"fullname": "debeir.training.utils.SentDataset", "modulename": "debeir.training.utils", "qualname": "SentDataset", "kind": "class", "doc": "
\n"}, {"fullname": "debeir.training.utils.SentDataset.__init__", "modulename": "debeir.training.utils", "qualname": "SentDataset.__init__", "kind": "function", "doc": "
\n", "signature": "(\tdataset : datasets . arrow_dataset . Dataset , \ttext_cols : List [ str ] , \tlabel_col : str = None , \tlabel = None ) "}, {"fullname": "debeir.training.utils.SentDatasetList", "modulename": "debeir.training.utils", "qualname": "SentDatasetList", "kind": "class", "doc": "
\n"}, {"fullname": "debeir.training.utils.SentDatasetList.__init__", "modulename": "debeir.training.utils", "qualname": "SentDatasetList.__init__", "kind": "function", "doc": "
\n", "signature": "(datasets : List [ debeir . training . utils . SentDataset ] ) "}, {"fullname": "debeir.training.utils.tokenize_function", "modulename": "debeir.training.utils", "qualname": "tokenize_function", "kind": "function", "doc": "Tokenizer function
\n\nParameters \n\n\ntokenizer : Tokenizer \nexamples : Input examples to tokenize \npadding_strategy : Padding strategy \ntruncate : Truncate sentences \n \n\nReturns \n\n\nReturns a list of tokenized examples\n
\n \n", "signature": "(tokenizer , examples , padding_strategy , truncate ): ", "funcdef": "def"}, {"fullname": "debeir.training.utils.get_max_seq_length", "modulename": "debeir.training.utils", "qualname": "get_max_seq_length", "kind": "function", "doc": "
\n", "signature": "(tokenizer , dataset , x_labels , dataset_key = 'train' ): ", "funcdef": "def"}, {"fullname": "debeir.utils", "modulename": "debeir.utils", "kind": "module", "doc": "Common utilities such as score normalization and creating output directory w/ checks
\n"}, {"fullname": "debeir.utils.scaler", "modulename": "debeir.utils.scaler", "kind": "module", "doc": "
\n"}, {"fullname": "debeir.utils.scaler.unpack_elasticsearch_scores", "modulename": "debeir.utils.scaler", "qualname": "unpack_elasticsearch_scores", "kind": "function", "doc": "Helper function to retrieve the top score of documents for each topic.\nUsed in NIR weight adjustment calculation.
\n\nParameters \n\n\nresults : Raw input of results from Elasticsearch library \n \n\nReturns \n\n\nReturns a 1-D dictionary of {topic_num: top_score} pairs.\n
\n \n", "signature": "(results ) -> Dict : ", "funcdef": "def"}, {"fullname": "debeir.utils.scaler.get_z_value", "modulename": "debeir.utils.scaler", "qualname": "get_z_value", "kind": "function", "doc": "Analytical solution for the normalization constant, z, used in NIR log normalization.
\n\nParameters \n\n\ncosine_ceiling : The highest theoretical additive cosine score \nbm25_ceiling : The highest BM25 score retrieved from a given topic OR an estimate. \n \n\nReturns \n\n\nThe normalization parameter for NIR log normalization.\n
\n \n", "signature": "(cosine_ceiling , bm25_ceiling ) -> float : ", "funcdef": "def"}, {"fullname": "debeir.utils.utils", "modulename": "debeir.utils.utils", "kind": "module", "doc": "
\n"}, {"fullname": "debeir.utils.utils.create_output_file", "modulename": "debeir.utils.utils", "qualname": "create_output_file", "kind": "function", "doc": "Create output file based on config instructions
\n\nParameters \n\n\nconfig : The config object with output file options. \nconfig_fp : The config file path used in default naming options for the output file. \nremove : Overwrites the output file if it exists \noutput_file : The output file path if it exists \noutput_directory : The output directory used for default naming (specified in nir config) \nkwargs : Compatibility arguments \n \n\nReturns \n", "signature": "(config , config_fp , remove , output_file , output_directory , ** kwargs ): ", "funcdef": "def"}, {"fullname": "debeir.utils.utils.unpack_coroutine", "modulename": "debeir.utils.utils", "qualname": "unpack_coroutine", "kind": "function", "doc": "Recursively unwraps co-routines until a result is reached.
\n\nParameters \n\n\nf : Wrapped co-routine function. \n \n\nReturns \n\n\nResults from the (final) evaluated co-routine.\n
\n \n", "signature": "(f ): ", "funcdef": "async def"}, {"fullname": "debeir.utils.utils.flatten", "modulename": "debeir.utils.utils", "qualname": "flatten", "kind": "function", "doc": "Flattens a multidimensional dictionary (dictionary of dictionaries) to a single layer with child keys seperated by\n\"sep\"
\n\nParameters \n\n\nd : Multi-level dictionary to flatten. \nparent_key : Prepend a parent_key to all layers. \nsep : Seperator token between child and parent layers. \n \n\nReturns \n\n\nA flattened 1-D dictionary with keys seperated by *sep*.\n
\n \n", "signature": "(d , parent_key = '' , sep = '_' ): ", "funcdef": "def"}, {"fullname": "debeir.utils.utils.remove_excess_whitespace", "modulename": "debeir.utils.utils", "qualname": "remove_excess_whitespace", "kind": "function", "doc": "
\n", "signature": "(s ): ", "funcdef": "def"}];
// mirrored in build-search-index.js (part 1)
// Also split on html tags. this is a cheap heuristic, but good enough.