Skip to content

Latest commit

 

History

History
128 lines (106 loc) · 5.88 KB

django-full-text-search.md

File metadata and controls

128 lines (106 loc) · 5.88 KB

django.contrib.postgres.search

因为对中文分词不好,暂时无法有效使用。

from functools import reduce

from django.contrib.postgres.search import SearchVector, SearchQuery, SearchRank, SearchHeadline, TrigramSimilarity
from django.core.management.base import BaseCommand
from django.db.models import Q
from django.db.models.sql import AND

from ml.models import Tw, Jf


class Command(BaseCommand):
    help = 'test search'

    def handle(self, *args, **options):
        tws0 = Tw.objects.filter(content__search='脉浮 or 汗出')
        # tws1 = Tw.objects.annotate(search=SearchVector('content', 'footnote')).filter(search='脉浮 -汗出')

        # query = SearchQuery("脉浮 汗出", search_type='websearch')
        # tws2 = Tw.objects.annotate(search2=SearchVector('content', 'footnote'),
        #                            headline1=SearchHeadline('content', query, start_sel='<span>',
        #                                                     stop_sel='</span>')).filter(search2=query)
        # vector = SearchVector('content')
        # query = SearchQuery('脉浮')
        # tws3 = Tw.objects.annotate(rank=SearchRank(vector, query)).order_by('-rank')
        print(len(tws0), tws0)
        # print(len(tws1), tws1)
        # print(len(tws2), tws2)
        # print(len(tws3), tws3)

        # for tw2 in tws2:
        #     print(tw2.headline1)

        # jfs0 = Jf.objects.filter(prescription__search='桂枝')
        # jfs1 = Jf.objects.filter(prescription__icontains='桂枝')
        # jfs2 = Jf.objects.annotate(search=SearchVector('prescription', 'usage')).filter(search='桂枝')
        #
        # query2 = SearchQuery('甘草', search_type='websearch')
        # jfs = Jf.objects.annotate(search2=SearchVector('name', config='english'),
        #                           h_prescription=SearchHeadline('name', query2, start_sel='<span>',
        #                                                         stop_sel='</span>')).filter(search2=query2)
        # print(len(jfs0), jfs0)
        # print(len(jfs1), jfs1)
        # print(len(jfs2), jfs2)
        # print(len(jfs), jfs)
        # for jf in jfs:
        #     print(jf.h_prescription)

        # test_query = '桂枝'
        # test_jfs = Jf.objects.annotate(
        #     similarity=TrigramSimilarity('name', test_query),
        # ).filter(similarity__gt=0.3).order_by('-similarity')
        #
        # print(len(test_jfs), test_jfs)

        # query11 = SearchQuery('少阴病 手足寒', search_type='websearch')
        # test_tws = Tw.objects.annotate(headline1=SearchHeadline('content', query11, start_sel='<span>',
        #                                                         stop_sel='</span>')).filter(content__icontains='少阴病')
        # print(len(test_tws), test_tws)
        # for tw in test_tws:
        #     print(tw.headline1)
        # print(query11)
        
        # 全文检索最大的问题是`SELECT to_tsvector('桂枝汤'); `,也就是django的SearchVector无法分词。
        # 就算分词,又无法精确匹配。作为伤寒论这种古文,精确更重要。

        q = '汗 '
        q_list = filter(None, q.split(' '))
        print(q_list)
        s = reduce(lambda q1, q2: q1 & q2, [Q(content__contains=t) for t in q_list])
        print(s)
        tw_list = Tw.objects.filter(s)
        print(len(tw_list), tw_list)

        # 以下是通过手动拆分用户输入的字符串进行contains形式的检索。正则匹配拆分搜索词比较麻烦。
        
        # 从postgreql的websearch_to_tsquery学习而来。也就是django的searchQuery('少阴病 手足寒', search_type='websearch')
        
        # websearch_to_tsquery使用另一种语法创建一个tsquery值querytext,其中简单的无格式文本是有效的查询。与plainto_tsqueryand不同phraseto_tsquery,它还可以识别某些运算符。此外,此函数永远不会引发语法错误,这使得使用原始用户提供的输入进行搜索成为可能。支持以下语法:
        # unquoted
        # text: 不在引号内的文本将被转换为由 & 运算符分隔的术语,就像由.处理一样plainto_tsquery。
        # "quoted text": 引号内的文本将被转换为由 <->运算符分隔的术语,就像由.处理一样phraseto_tsquery。
        # OR:单词“或”将被转换为 | 运算符。
        # -:破折号将转换为!运算符。
        # https://www.postgresql.org/docs/current/textsearch-controls.html

        ss = '汗 or 渴 or 咳 桂枝 -热'
        # 1,提取or,先把or连起来,空格区分and和exclude。2,提取and。3,提取exclude。
        ll = ss.replace(' or ', '_or_').split(' ')
        print(ll)
        or_and_or_list = list(filter(lambda k: '_or_' in k, ll))
        print(or_and_or_list)
        q_or_list = []
        for x in or_and_or_list:
            or_list = x.split('_or_')
            print(or_list, 'or')
            s = reduce(lambda q1, q2: q1 | q2, [Q(content__contains=k) for k in or_list])
            print(s)
            q_or_list.append(s)
            qs = Tw.objects.filter(s)
            print(len(qs), qs)
        print(q_or_list)
        sx = reduce(lambda q1, q2: q1 & q2, q_or_list)
        print(sx)
        qs0 = Tw.objects.filter(sx)
        print(len(qs0), qs0)

        and_list = list(filter(lambda k: '_or_' not in k and not k.startswith('-'), ll))
        print(and_list)
        s1 = reduce(lambda q1, q2: q1 & q2, [Q(content__contains=k) for k in and_list])
        print(s1)
        qs1 = Tw.objects.filter(sx).filter(s1)
        print(len(qs1), qs1)

        exclude_list = list(filter(lambda k: k.startswith('-'), ll))
        print(exclude_list)
        s2 = reduce(lambda q1, q2: q1 & q2, [Q(content__contains=k.replace('-', '')) for k in exclude_list])
        print(s2)
        qs2 = Tw.objects.filter(sx).filter(s1).exclude(s2)
        print(len(qs2), qs2)

        self.stdout.write(self.style.SUCCESS('Successfully'))