diff --git a/.github/workflows/helm-push.yml b/.github/workflows/helm-push.yml new file mode 100644 index 0000000..91329ee --- /dev/null +++ b/.github/workflows/helm-push.yml @@ -0,0 +1,134 @@ +name: Publish OpenG2P Helm charts on + +on: + push: + tags-ignore: + - '**' + branches: + - 1.* + - develop + - main + workflow_dispatch: + inputs: + forcePublishCharts: + description: "Force publish Charts?" + default: "*" + type: string + +jobs: + generate-charts: + runs-on: ubuntu-latest + env: + SKIP: 'FALSE' + RANCHER_CHART_FILTER: "openg2p.org/add-to-rancher" + FORCE_PUBLISH_CHARTS: "${{ inputs.forcePublishCharts || '' }}" + defaults: + run: + shell: bash + steps: + - name: Checkout Repository + uses: actions/checkout@v3 + + - id: files + if: env.FORCE_PUBLISH_CHARTS == '' + uses: jitterbit/get-changed-files@v1 + + - name: save helm/charts to tmp.txt file + run: | + touch charts-list.txt + if [ -n "${FORCE_PUBLISH_CHARTS}" ]; then + for chart in charts/${FORCE_PUBLISH_CHARTS}/; do + chart="${chart#charts/}" + chart="${chart%/}" + echo "$chart" >> charts-list.txt + done + else + for changed_file in ${{ steps.files.outputs.all }}; do + if [[ ${changed_file} =~ ^charts ]]; then + chart_name=$(echo "${changed_file}" | awk -F/ '/^[charts]/{print $2}') + echo $chart_name >> charts-list.txt; + echo "Saved $chart_name chart to charts-list.txt" + fi + done + cat charts-list.txt | sort | uniq > charts-list-unique.txt + mv charts-list-unique.txt charts-list.txt + fi + echo "List of charts to be published"; + cat charts-list.txt + + - name: Generate tar files + run: | + if [[ ! -s charts-list.txt ]]; then + echo "::warning::No Charts to publish"; + echo "SKIP=TRUE" >> $GITHUB_ENV + else + for chartpath in charts/*/; do + if [ -f ${chartpath}Chart.yaml ]; then + helm dep up $chartpath + fi + done + RANCHER_CHARTS=() + while IFS= read -r chartpath; do + echo "chartpath: $chartpath" + chartname=$(basename "$chartpath") + if [ -f charts/${chartname}/Chart.yaml ]; then + echo "Chartname: $chartname" + helm package charts/$chartpath + is_rancher_chart=$(grep "$RANCHER_CHART_FILTER" charts/${chartpath%*/}/Chart.yaml || true) + if [ -n "$is_rancher_chart" ]; then + RANCHER_CHARTS+=("$chartname") + fi + fi + done < charts-list.txt + echo "RANCHER_CHARTS=${RANCHER_CHARTS[@]}" >> $GITHUB_ENV + rm charts-list.txt + fi + + shopt -s nocasematch + if [[ '${{ github.repository_owner }}' != 'OpenG2P' ]]; then + echo "SKIP=TRUE" >> $GITHUB_ENV + fi + - name: Upload tar as Artifact + uses: actions/upload-artifact@v4 + with: + name: charts + path: ./*.tgz + if: env.SKIP != 'TRUE' + + - name: Checkout branch for publishing + uses: actions/checkout@v3 + with: + repository: 'openg2p/openg2p-helm' + ref: gh-pages + token: ${{ secrets.OPENG2P_BOT_GITHUB_PAT }} + if: env.SKIP != 'TRUE' + + - name: Download tar from Artifacts + uses: actions/download-artifact@v4 + with: + name: charts + path: ./ + if: env.SKIP != 'TRUE' + + - name: Update index.yaml + run: | + helm repo index --url https://openg2p.github.io/openg2p-helm/ . + for chartname in $RANCHER_CHARTS; do + cp ${chartname}*.tgz rancher/ + done + helm repo index --url https://openg2p.github.io/openg2p-helm/ --merge rancher/index.yaml rancher + for chartname in $RANCHER_CHARTS; do + rm rancher/${chartname}*.tgz || true + done + if: env.SKIP != 'TRUE' + + - name: Commit Changes to repository + uses: EndBug/add-and-commit@v7 + with: + branch: gh-pages + author_name: openg2pbot + author_email: bot@openg2p.org + default_author: user_info + message: 'added deduplicator helm charts for publish openg2p/openg2p-deduplicator@${{ github.sha }}' + add: './*.tgz ./index.yaml rancher/index.yaml' + if: env.SKIP != 'TRUE' diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 3938ad1..916c0ab 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -26,6 +26,7 @@ repos: - id: check-symlinks - id: check-toml - id: check-yaml + exclude: ^charts/.*/templates/ args: - --unsafe - id: mixed-line-ending diff --git a/README.md b/README.md index 800259e..efaa276 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,8 @@ Deduplication service of OpenG2P offering fuzzy match on demographic data. +Refer to https://docs.openg2p.org. + ## Available Packages Package | Description diff --git a/charts/openg2p-deduplicator/.gitignore b/charts/openg2p-deduplicator/.gitignore new file mode 100644 index 0000000..ee3892e --- /dev/null +++ b/charts/openg2p-deduplicator/.gitignore @@ -0,0 +1 @@ +charts/ diff --git a/charts/openg2p-deduplicator/.helmignore b/charts/openg2p-deduplicator/.helmignore new file mode 100644 index 0000000..f0c1319 --- /dev/null +++ b/charts/openg2p-deduplicator/.helmignore @@ -0,0 +1,21 @@ +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. +.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*~ +# Various IDEs +.project +.idea/ +*.tmproj diff --git a/charts/openg2p-deduplicator/Chart.lock b/charts/openg2p-deduplicator/Chart.lock new file mode 100644 index 0000000..b49a008 --- /dev/null +++ b/charts/openg2p-deduplicator/Chart.lock @@ -0,0 +1,9 @@ +dependencies: +- name: common + repository: oci://registry-1.docker.io/bitnamicharts + version: 2.24.0 +- name: opensearch + repository: oci://registry-1.docker.io/bitnamicharts + version: 1.2.0 +digest: sha256:9382dcc5ef4ac9e3c203e428d70c8c9f7de47a0ae6b2197d77696cfcdb556345 +generated: "2024-10-11T13:06:07.180171021+05:30" diff --git a/charts/openg2p-deduplicator/Chart.yaml b/charts/openg2p-deduplicator/Chart.yaml new file mode 100644 index 0000000..762b4ba --- /dev/null +++ b/charts/openg2p-deduplicator/Chart.yaml @@ -0,0 +1,26 @@ +apiVersion: v2 +name: openg2p-deduplicator +description: A Helm chart for OpenG2P Deduplicator +type: application +version: 0.0.0-develop +appVersion: "" +dependencies: +- name: common + repository: oci://registry-1.docker.io/bitnamicharts + version: 2.x.x +- name: opensearch + version: 1.2.0 + repository: oci://registry-1.docker.io/bitnamicharts + condition: opensearch.enabled +home: https://openg2p.org +keywords: + - openg2p + - deduplication + - deduplicator + - opensearch +maintainers: + - email: info@openg2p.org + name: OpenG2P +icon: https://openg2p.github.io/openg2p-helm/openg2p-logo.png +annotations: + catalog.cattle.io/display-name: "OpenG2P Deduplicator" diff --git a/charts/openg2p-deduplicator/README.md b/charts/openg2p-deduplicator/README.md new file mode 100644 index 0000000..03f0828 --- /dev/null +++ b/charts/openg2p-deduplicator/README.md @@ -0,0 +1,5 @@ +# OpenG2P Deduplicator + +Helm chart for installing OpenG2P Deduplicator. + +Refer to https://docs.openg2p.org. diff --git a/charts/openg2p-deduplicator/app-readme.md b/charts/openg2p-deduplicator/app-readme.md new file mode 100644 index 0000000..e69de29 diff --git a/charts/openg2p-deduplicator/questions.yaml b/charts/openg2p-deduplicator/questions.yaml new file mode 100644 index 0000000..7fba6d5 --- /dev/null +++ b/charts/openg2p-deduplicator/questions.yaml @@ -0,0 +1,35 @@ +questions: +- variable: opensearch.enabled + description: This installs OpenSearch along with the deduplicator + type: boolean + label: Install OpenSearch? + +- variable: opensearch.dashboards.enabled + description: This installs OpenSearch Dashboards along with OpenSearch + type: boolean + label: Install OpenSearch Dashboards? + show_if: "opensearch.enabled=true" + +- variable: opensearch.hostname + description: Hostname on which OpenSearch Dashboards should be accessed + type: string + label: OpenSearch Hostname + show_if: "opensearch.enabled=true&&opensearch.dashboards.enabled=true" + +- variable: global.keycloakBaseUrl + description: Required for Authentication. + type: string + label: Keycloak Base URL + show_if: "opensearch.enabled=true&&opensearch.dashboards.enabled=true" + +- variable: opensearch.oidcClientId + description: OIDC Client ID for OpenSearch login + type: string + label: OpenSearch OIDC Client ID + show_if: "opensearch.enabled=true&&opensearch.dashboards.enabled=true" + +- variable: opensearch.oidcClientSecret + description: OIDC Client Secret for OpenSearch login + type: string + label: OpenSearch OIDC Client Secret + show_if: "opensearch.enabled=true&&opensearch.dashboards.enabled=true" diff --git a/charts/openg2p-deduplicator/templates/_helpers.tpl b/charts/openg2p-deduplicator/templates/_helpers.tpl new file mode 100644 index 0000000..8107f53 --- /dev/null +++ b/charts/openg2p-deduplicator/templates/_helpers.tpl @@ -0,0 +1,46 @@ +{{/* +Return the proper image name +*/}} +{{- define "deduplicator.image" -}} +{{ include "common.images.image" (dict "imageRoot" .Values.image "global" .Values.global) }} +{{- end -}} + +{{/* +Return the proper Docker Image Registry Secret Names +*/}} +{{- define "deduplicator.imagePullSecrets" -}} +{{- include "common.images.pullSecrets" (dict "images" (list .Values.image) "global" .Values.global) -}} +{{- end -}} + +{{/* +Create the name of the service account to use +*/}} +{{- define "deduplicator.serviceAccountName" -}} +{{- if .Values.serviceAccount.create -}} + {{ default (printf "%s-foo" (include "common.names.fullname" .)) .Values.serviceAccount.name }} +{{- else -}} + {{ default "default" .Values.serviceAccount.name }} +{{- end -}} +{{- end -}} + +{{/* +Render Env values section +*/}} +{{- define "deduplicator.baseEnvVars" -}} +{{- $context := .context -}} +{{- range $k, $v := .envVars }} +- name: {{ $k }} +{{- if or (kindIs "int64" $v) (kindIs "float64" $v) (kindIs "bool" $v) }} + value: {{ $v | quote }} +{{- else if kindIs "string" $v }} + value: {{ include "common.tplvalues.render" ( dict "value" $v "context" $context ) | squote }} +{{- else }} + valueFrom: {{- include "common.tplvalues.render" ( dict "value" $v "context" $context ) | nindent 4}} +{{- end }} +{{- end }} +{{- end -}} + +{{- define "deduplicator.envVars" -}} +{{- $envVars := merge (deepCopy .Values.envVars) (deepCopy .Values.envVarsFrom) -}} +{{- include "deduplicator.baseEnvVars" (dict "envVars" $envVars "context" $) }} +{{- end -}} diff --git a/charts/openg2p-deduplicator/templates/deployment.yaml b/charts/openg2p-deduplicator/templates/deployment.yaml new file mode 100644 index 0000000..f917f55 --- /dev/null +++ b/charts/openg2p-deduplicator/templates/deployment.yaml @@ -0,0 +1,93 @@ +apiVersion: {{ include "common.capabilities.deployment.apiVersion" . }} +kind: Deployment +metadata: + name: {{ template "common.names.fullname" . }} + labels: {{- include "common.labels.standard" (dict "customLabels" .Values.commonLabels "context" $) | nindent 4 }} + {{- if .Values.commonAnnotations }} + annotations: {{- include "common.tplvalues.render" ( dict "value" .Values.commonAnnotations "context" $ ) | nindent 4 }} + {{- end }} +spec: + replicas: {{ .Values.replicaCount }} + {{- if .Values.updateStrategy }} + strategy: {{- toYaml .Values.updateStrategy | nindent 4 }} + {{- end }} + selector: + matchLabels: {{- include "common.labels.matchLabels" (dict "customLabels" .Values.podLabels "context" $) | nindent 6 }} + template: + metadata: + {{- if .Values.podAnnotations }} + annotations: {{- include "common.tplvalues.render" (dict "value" .Values.podAnnotations "context" $) | nindent 8 }} + {{- end }} + labels: {{- include "common.labels.standard" (dict "customLabels" .Values.podLabels "context" $) | nindent 8 }} + spec: + serviceAccountName: {{ template "deduplicator.serviceAccountName" . }} + {{- include "deduplicator.imagePullSecrets" . | nindent 6 }} + {{- if .Values.hostAliases }} + hostAliases: {{- include "common.tplvalues.render" (dict "value" .Values.hostAliases "context" $) | nindent 8 }} + {{- end }} + {{- if .Values.affinity }} + affinity: {{- include "common.tplvalues.render" ( dict "value" .Values.affinity "context" $) | nindent 8 }} + {{- else }} + affinity: + podAffinity: {{- include "common.affinities.pods" (dict "type" .Values.podAffinityPreset "context" $) | nindent 10 }} + podAntiAffinity: {{- include "common.affinities.pods" (dict "type" .Values.podAntiAffinityPreset "context" $) | nindent 10 }} + nodeAffinity: {{- include "common.affinities.nodes" (dict "type" .Values.nodeAffinityPreset.type "key" .Values.nodeAffinityPreset.key "values" .Values.nodeAffinityPreset.values) | nindent 10 }} + {{- end }} + {{- if .Values.nodeSelector }} + nodeSelector: {{- include "common.tplvalues.render" ( dict "value" .Values.nodeSelector "context" $) | nindent 8 }} + {{- end }} + {{- if .Values.tolerations }} + tolerations: {{- include "common.tplvalues.render" (dict "value" .Values.tolerations "context" .) | nindent 8 }} + {{- end }} + {{- if .Values.priorityClassName }} + priorityClassName: {{ .Values.priorityClassName | quote }} + {{- end }} + {{- if .Values.podSecurityContext.enabled }} + securityContext: {{- omit .Values.podSecurityContext "enabled" | toYaml | nindent 8 }} + {{- end }} + {{- if .Values.initContainers }} + initContainers: + {{- include "common.tplvalues.render" (dict "value" .Values.initContainers "context" $) | nindent 8 }} + {{- end }} + containers: + - name: deduplicator + image: {{ template "deduplicator.image" . }} + imagePullPolicy: {{ .Values.image.pullPolicy }} + {{- if .Values.lifecycleHooks }} + lifecycle: {{- include "common.tplvalues.render" (dict "value" .Values.lifecycleHooks "context" $) | nindent 12 }} + {{- end }} + {{- if .Values.containerSecurityContext.enabled }} + securityContext: {{- omit .Values.containerSecurityContext "enabled" | toYaml | nindent 12 }} + {{- end }} + {{- if .Values.command }} + command: {{- include "common.tplvalues.render" (dict "value" .Values.command "context" $) | nindent 12 }} + {{- end }} + {{- if .Values.args }} + args: {{- include "common.tplvalues.render" (dict "value" .Values.args "context" $) | nindent 12 }} + {{- end }} + env: + {{- include "deduplicator.envVars" . | nindent 12 }} + ports: + - name: http + containerPort: {{ .Values.containerPort }} + {{- if .Values.resources }} + resources: {{- toYaml .Values.resources | nindent 12 }} + {{- end }} + {{- if .Values.startupProbe.enabled }} + startupProbe: {{- include "common.tplvalues.render" (dict "value" (omit .Values.startupProbe "enabled") "context" $) | nindent 12 }} + {{- end }} + {{- if .Values.livenessProbe.enabled }} + livenessProbe: {{- include "common.tplvalues.render" (dict "value" (omit .Values.livenessProbe "enabled") "context" $) | nindent 12 }} + {{- end }} + {{- if .Values.readinessProbe.enabled }} + readinessProbe: {{- include "common.tplvalues.render" (dict "value" (omit .Values.readinessProbe "enabled") "context" $) | nindent 12 }} + {{- end }} + {{- if .Values.extraVolumeMounts }} + volumeMounts: {{- include "common.tplvalues.render" (dict "value" .Values.extraVolumeMounts "context" $) | nindent 12 }} + {{- end }} + {{- if .Values.sidecars }} + {{- include "common.tplvalues.render" ( dict "value" .Values.sidecars "context" $) | nindent 8 }} + {{- end }} + {{- if .Values.extraVolumes }} + volumes: {{- include "common.tplvalues.render" (dict "value" .Values.extraVolumes "context" $) | nindent 8 }} + {{- end }} diff --git a/charts/openg2p-deduplicator/templates/extra-list.yaml b/charts/openg2p-deduplicator/templates/extra-list.yaml new file mode 100644 index 0000000..9ac65f9 --- /dev/null +++ b/charts/openg2p-deduplicator/templates/extra-list.yaml @@ -0,0 +1,4 @@ +{{- range .Values.extraDeploy }} +--- +{{ include "common.tplvalues.render" (dict "value" . "context" $) }} +{{- end }} diff --git a/charts/openg2p-deduplicator/templates/gateway.yaml b/charts/openg2p-deduplicator/templates/gateway.yaml new file mode 100644 index 0000000..b6ffae0 --- /dev/null +++ b/charts/openg2p-deduplicator/templates/gateway.yaml @@ -0,0 +1,38 @@ +{{- if .Values.istio.enabled }} +{{- if .Values.istio.gateway.enabled }} +apiVersion: networking.istio.io/v1beta1 +kind: Gateway +metadata: + name: {{ include "common.names.fullname" . }} + labels: {{ include "common.labels.standard" (dict "customLabels" .Values.commonLabels "context" $) | nindent 4 }} + {{- if .Values.commonAnnotations }} + annotations: {{ include "common.tplvalues.render" (dict "value" .Values.commonAnnotations "context" $) | nindent 4 }} + {{- end }} +spec: + selector: + {{ toYaml .Values.istio.gateway.ingressController | nindent 4 }} + servers: + {{- if .Values.istio.gateway.httpEnabled }} + - port: + name: http2 + number: 8080 + protocol: HTTP2 + hosts: + - {{ default .Values.hostname .Values.istio.gateway.host | quote }} + {{- if .Values.istio.gateway.httpTlsRedirect }} + tls: + httpsRedirect: true + {{- end }} + {{- end }} + {{- if .Values.istio.gateway.httpsEnabled }} + - port: + name: https + number: 8443 + protocol: HTTPS + hosts: + - {{ default .Values.hostname .Values.istio.gateway.host | quote }} + tls: + {{ toYaml (omit .Values.istio.gateway.tls "enabled") | nindent 6 }} + {{- end }} +{{- end }} +{{- end }} diff --git a/charts/openg2p-deduplicator/templates/opensearch/configmap.yaml b/charts/openg2p-deduplicator/templates/opensearch/configmap.yaml new file mode 100644 index 0000000..3e475d8 --- /dev/null +++ b/charts/openg2p-deduplicator/templates/opensearch/configmap.yaml @@ -0,0 +1,23 @@ +{{- if .Values.opensearch.enabled }} +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ include "common.names.fullname" .Subcharts.opensearch }}-custom-config + labels: {{ include "common.labels.standard" (dict "customLabels" .Values.opensearch.commonLabels "context" .Subcharts.opensearch) | nindent 4 }} + {{- if .Values.opensearch.commonAnnotations }} + annotations: {{ include "common.tplvalues.render" (dict "value" .Values.opensearch.commonAnnotations "context" $) | nindent 4 }} + {{- end }} +data: + {{- if .Values.opensearch.security.enabled }} + {{- if .Values.opensearch.security.extraConfig }} + opensearch-security-config.yml: |- + {{- include "common.tplvalues.render" (dict "value" .Values.opensearch.security.extraConfig "context" $) | nindent 4 }} + {{- end }} + {{- end }} + {{- if .Values.opensearch.dashboards.enabled }} + {{- if .Values.opensearch.dashboards.extraConfig }} + opensearch_dashboards.yml: |- + {{- include "common.tplvalues.render" (dict "value" .Values.opensearch.dashboards.extraConfig "context" $) | nindent 4 }} + {{- end }} + {{- end }} +{{- end }} diff --git a/charts/openg2p-deduplicator/templates/opensearch/gateway.yaml b/charts/openg2p-deduplicator/templates/opensearch/gateway.yaml new file mode 100644 index 0000000..03c9ce9 --- /dev/null +++ b/charts/openg2p-deduplicator/templates/opensearch/gateway.yaml @@ -0,0 +1,42 @@ +{{- if .Values.opensearch.enabled }} +{{- if .Values.opensearch.dashboards.enabled }} +{{- if .Values.opensearch.istio.enabled }} +{{- if .Values.opensearch.istio.gateway.enabled }} +apiVersion: networking.istio.io/v1beta1 +kind: Gateway +metadata: + name: {{ include "common.names.fullname" .Subcharts.opensearch }} + labels: {{ include "common.labels.standard" (dict "customLabels" .Values.opensearch.commonLabels "context" .Subcharts.opensearch) | nindent 4 }} + {{- if .Values.opensearch.commonAnnotations }} + annotations: {{ include "common.tplvalues.render" (dict "value" .Values.opensearch.commonAnnotations "context" $) | nindent 4 }} + {{- end }} +spec: + selector: + {{ toYaml .Values.opensearch.istio.gateway.ingressController | nindent 4 }} + servers: + {{- if .Values.opensearch.istio.gateway.httpEnabled }} + - port: + name: http2 + number: 8080 + protocol: HTTP2 + hosts: + - {{ default .Values.opensearch.hostname .Values.opensearch.istio.gateway.host | quote }} + {{- if .Values.opensearch.istio.gateway.httpTlsRedirect }} + tls: + httpsRedirect: true + {{- end }} + {{- end }} + {{- if .Values.opensearch.istio.gateway.httpsEnabled }} + - port: + name: https + number: 8443 + protocol: HTTPS + hosts: + - {{ default .Values.opensearch.hostname .Values.opensearch.istio.gateway.host | quote }} + tls: + {{ toYaml (omit .Values.opensearch.istio.gateway.tls "enabled") | nindent 6 }} + {{- end }} +{{- end }} +{{- end }} +{{- end }} +{{- end }} diff --git a/charts/openg2p-deduplicator/templates/opensearch/virtualservice.yaml b/charts/openg2p-deduplicator/templates/opensearch/virtualservice.yaml new file mode 100644 index 0000000..d23cf7a --- /dev/null +++ b/charts/openg2p-deduplicator/templates/opensearch/virtualservice.yaml @@ -0,0 +1,32 @@ +{{- if .Values.opensearch.enabled }} +{{- if .Values.opensearch.dashboards.enabled }} +{{- if .Values.opensearch.istio.enabled }} +{{- if .Values.opensearch.istio.virtualservice.enabled }} +apiVersion: networking.istio.io/v1alpha3 +kind: VirtualService +metadata: + name: {{ include "common.names.fullname" .Subcharts.opensearch }} + labels: {{ include "common.labels.standard" (dict "customLabels" .Values.opensearch.commonLabels "context" .Subcharts.opensearch) | nindent 4 }} + {{- if .Values.opensearch.commonAnnotations }} + annotations: {{ include "common.tplvalues.render" (dict "value" .Values.opensearch.commonAnnotations "context" $) | nindent 4 }} + {{- end }} +spec: + hosts: + - {{ default .Values.opensearch.hostname .Values.opensearch.istio.virtualservice.host | quote }} + gateways: + - {{ default (include "common.names.fullname" .Subcharts.opensearch) .Values.opensearch.istio.virtualservice.gateway }} + http: + - headers: + request: + set: + x-forwarded-host: {{ default .Values.opensearch.hostname .Values.opensearch.istio.virtualservice.host | quote }} + x-forwarded-proto: https + route: + - destination: + host: {{ include "common.tplvalues.render" (dict "value" .Values.opensearch.istio.virtualservice.destination "context" .Subcharts.opensearch) }} + port: + number: {{ include "common.tplvalues.render" (dict "value" .Values.opensearch.istio.virtualservice.destinationPort "context" .Subcharts.opensearch) }} +{{- end }} +{{- end }} +{{- end }} +{{- end }} diff --git a/charts/openg2p-deduplicator/templates/service-account.yaml b/charts/openg2p-deduplicator/templates/service-account.yaml new file mode 100644 index 0000000..6cff4f0 --- /dev/null +++ b/charts/openg2p-deduplicator/templates/service-account.yaml @@ -0,0 +1,10 @@ +{{- if .Values.serviceAccount.create }} +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ template "deduplicator.serviceAccountName" . }} + labels: {{- include "common.labels.standard" (dict "customLabels" .Values.commonLabels "context" $) | nindent 4 }} + {{- if .Values.commonAnnotations }} + annotations: {{- include "common.tplvalues.render" ( dict "value" .Values.commonAnnotations "context" $ ) | nindent 4 }} + {{- end }} +{{- end }} diff --git a/charts/openg2p-deduplicator/templates/service.yaml b/charts/openg2p-deduplicator/templates/service.yaml new file mode 100644 index 0000000..12558df --- /dev/null +++ b/charts/openg2p-deduplicator/templates/service.yaml @@ -0,0 +1,25 @@ +apiVersion: v1 +kind: Service +metadata: + name: {{ template "common.names.fullname" . }} + labels: {{- include "common.labels.standard" (dict "customLabels" .Values.commonLabels "context" $) | nindent 4 }} + {{- if .Values.commonAnnotations }} + annotations: {{- include "common.tplvalues.render" ( dict "value" .Values.commonAnnotations "context" $ ) | nindent 4 }} + {{- end }} +spec: + type: {{ .Values.service.type }} + {{- if (or (eq .Values.service.type "LoadBalancer") (eq .Values.service.type "NodePort")) }} + externalTrafficPolicy: {{ .Values.service.externalTrafficPolicy | quote }} + {{- end }} + {{ if eq .Values.service.type "LoadBalancer" }} + loadBalancerSourceRanges: {{ .Values.service.loadBalancerSourceRanges }} + {{ end }} + {{- if (and (eq .Values.service.type "LoadBalancer") (not (empty .Values.service.loadBalancerIP))) }} + loadBalancerIP: {{ .Values.service.loadBalancerIP }} + {{- end }} + ports: + - name: http + port: {{ .Values.service.port }} + targetPort: {{ .Values.containerPort }} + protocol: TCP + selector: {{- include "common.labels.matchLabels" (dict "customLabels" .Values.podLabels "context" $) | nindent 4 }} diff --git a/charts/openg2p-deduplicator/templates/virtualservice.yaml b/charts/openg2p-deduplicator/templates/virtualservice.yaml new file mode 100644 index 0000000..eeb871a --- /dev/null +++ b/charts/openg2p-deduplicator/templates/virtualservice.yaml @@ -0,0 +1,35 @@ +{{- if .Values.istio.enabled }} +{{- if .Values.istio.virtualservice.enabled }} +apiVersion: networking.istio.io/v1alpha3 +kind: VirtualService +metadata: + name: {{ include "common.names.fullname" . }} + labels: {{ include "common.labels.standard" (dict "customLabels" .Values.commonLabels "context" $) | nindent 4 }} + {{- if .Values.commonAnnotations }} + annotations: {{ include "common.tplvalues.render" (dict "value" .Values.commonAnnotations "context" $) | nindent 4 }} + {{- end }} +spec: + hosts: + - {{ default .Values.hostname .Values.istio.virtualservice.host | quote }} + gateways: + - {{ default (include "common.names.fullname" .) .Values.istio.virtualservice.gateway }} + http: + - headers: + request: + set: + x-forwarded-host: {{ default .Values.hostname .Values.istio.virtualservice.host | quote }} + x-forwarded-proto: https + match: + - uri: + prefix: {{ include "common.tplvalues.render" (dict "value" .Values.istio.virtualservice.prefix "context" $) }} + {{- if .Values.istio.virtualservice.rewriteUri }} + rewrite: + uri: {{ include "common.tplvalues.render" (dict "value" .Values.istio.virtualservice.rewriteUri "context" $) }} + {{- end }} + route: + - destination: + host: {{ include "common.tplvalues.render" (dict "value" .Values.istio.virtualservice.destination "context" $) }} + port: + number: {{ include "common.tplvalues.render" (dict "value" .Values.istio.virtualservice.destinationPort "context" $) }} +{{- end }} +{{- end }} diff --git a/charts/openg2p-deduplicator/values.yaml b/charts/openg2p-deduplicator/values.yaml new file mode 100644 index 0000000..b3ea567 --- /dev/null +++ b/charts/openg2p-deduplicator/values.yaml @@ -0,0 +1,479 @@ +## Global Docker image parameters +## Please, note that this will override the image parameters, including dependencies, configured to use the global value +## Current available global Docker image parameters: imageRegistry and imagePullSecrets +## +# global: +# imageRegistry: myRegistryName +# imagePullSecrets: +# - myRegistryKeySecretName +# storageClass: myStorageClass +global: + keycloakBaseUrl: https://keycloak.openg2p.sandbox.net + keycloakIssuerUrl: '{{ tpl .Values.global.keycloakBaseUrl $ }}/realms/master' + +hostname: socialregistry.openg2p.sandbox.net + +## Add labels to all the deployed resources +## +commonLabels: {} + +## Add annotations to all the deployed resources +## +commonAnnotations: {} + +## Extra objects to deploy (value evaluated as a template) +## +extraDeploy: [] + +## Number of nodes +## +replicaCount: 1 + +service: + type: ClusterIP + port: 80 + ## loadBalancerIP for the SuiteCRM Service (optional, cloud specific) + ## ref: http://kubernetes.io/docs/user-guide/services/#type-loadbalancer + ## + ## loadBalancerIP: + ## + ## nodePorts: + ## http: + ## https: + ## + nodePorts: + http: "" + https: "" + ## Enable client source IP preservation + ## ref http://kubernetes.io/docs/tasks/access-application-cluster/create-external-load-balancer/#preserving-the-client-source-ip + ## + externalTrafficPolicy: Cluster + +## Here the version used is slightly modified version of 1.1.5 - this has been done to make it compatiable +## with biosdk 'develop' version. TODO: Change this later +image: + registry: docker.io + repository: openg2p/openg2p-deduplicator + tag: develop + ## Specify a imagePullPolicy + ## Defaults to 'Always' if image tag is 'latest', else set to 'IfNotPresent' + ## ref: http://kubernetes.io/docs/user-guide/images/#pre-pulling-images + ## + pullPolicy: Always + ## Optionally specify an array of imagePullSecrets. + ## Secrets must be manually created in the namespace. + ## ref: https://kubernetes.io/docs/tasks/configure-pod-container/pull-image-private-registry/ + ## + # pullSecrets: + # - myRegistryKeySecretName + +containerPort: 8000 + +## Configure extra options for liveness and readiness probes +## ref: https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-probes/#configure-probes +## +startupProbe: + enabled: true + httpGet: + path: /health + port: http + initialDelaySeconds: 0 + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 10 + successThreshold: 1 + +livenessProbe: + enabled: true + httpGet: + path: /health + port: http + initialDelaySeconds: 10 + periodSeconds: 20 + timeoutSeconds: 1 + failureThreshold: 3 + successThreshold: 1 + +readinessProbe: + enabled: true + httpGet: + path: /health + port: http + initialDelaySeconds: 0 + periodSeconds: 10 + timeoutSeconds: 1 + failureThreshold: 3 + successThreshold: 1 + +## +# existingConfigmap: + +## Command and args for running the container (set to default if not set). Use array form +## +command: [] +args: [] + +## Deployment pod host aliases +## https://kubernetes.io/docs/concepts/services-networking/add-entries-to-pod-etc-hosts-with-host-aliases/ +## +hostAliases: [] + +## ref: http://kubernetes.io/docs/user-guide/compute-resources/ +## +## We usually recommend not to specify default resources and to leave this as a conscious +## choice for the user. This also increases chances charts run on environments with little +## resources, such as Minikube. If you do want to specify resources, uncomment the following +## lines, adjust them as necessary. +# resources: +# limits: +# cpu: 1000m +# memory: 500Mi +# requests: +# cpu: 200m +# memory: 200Mi +resources: {} + +## ref: https://kubernetes.io/docs/tasks/configure-pod-container/security-context/#set-the-security-context-for-a-container +## +containerSecurityContext: + enabled: false + runAsUser: 1001 + runAsNonRoot: true + +## ref: https://kubernetes.io/docs/tasks/configure-pod-container/security-context/#set-the-security-context-for-a-pod +## +podSecurityContext: + enabled: false + fsGroup: 1001 + +## Pod affinity preset +## ref: https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/#inter-pod-affinity-and-anti-affinity +## Allowed values: soft, hard +## +podAffinityPreset: "" + +## Pod anti-affinity preset +## Ref: https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/#inter-pod-affinity-and-anti-affinity +## Allowed values: soft, hard +## +podAntiAffinityPreset: soft + +## Node affinity preset +## Ref: https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/#node-affinity +## Allowed values: soft, hard +## +nodeAffinityPreset: + ## Node affinity type + ## Allowed values: soft, hard + ## + type: "" + ## Node label key to match + ## E.g. + ## key: "kubernetes.io/e2e-az-name" + ## + key: "" + ## Node label values to match + ## E.g. + ## values: + ## - e2e-az1 + ## - e2e-az2 + ## + values: [] + +## Affinity for pod assignment. Evaluated as a template. +## Ref: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/#affinity-and-anti-affinity +## +affinity: {} + +## Node labels for pod assignment. Evaluated as a template. +## ref: https://kubernetes.io/docs/user-guide/node-selection/ +## +nodeSelector: {} + +## Tolerations for pod assignment. Evaluated as a template. +## ref: https://kubernetes.io/docs/concepts/configuration/taint-and-toleration/ +## +tolerations: [] + +## Pod extra labels +## ref: https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/ +## +podLabels: {} + +## Annotations for server pods. +## ref: https://kubernetes.io/docs/concepts/overview/working-with-objects/annotations/ +## +podAnnotations: {} + +## pods' priority. +## ref: https://kubernetes.io/docs/concepts/configuration/pod-priority-preemption/ +## +# priorityClassName: "" + +## lifecycleHooks for the container to automate configuration before or after startup. +## +lifecycleHooks: {} + +## Update strategy - only really applicable for deployments with RWO PVs attached +## If replicas = 1, an update can get "stuck", as the previous pod remains attached to the +## PV, and the "incoming" pod can never start. Changing the strategy to "Recreate" will +## terminate the single previous pod, so that the new, incoming pod can attach to the PV +## +updateStrategy: + type: RollingUpdate + +## Additional environment variables to set +## Example: +## extraEnvVars: +## - name: FOO +## value: "bar" +## +extraEnvVars: [] + +## ConfigMap with extra environment variables +## +extraEnvVarsCM: + +## Secret with extra environment variables +## +extraEnvVarsSecret: + +## Extra volumes to add to the deployment +## +extraVolumes: [] + +## Extra volume mounts to add to the container +## +extraVolumeMounts: [] + +## Add init containers to the pods. +## Example: +## initContainers: +## - name: your-image-name +## image: your-image +## imagePullPolicy: Always +## ports: +## - name: portname +## containerPort: 1234 +## +initContainers: [] + +## Add sidecars to the pods. +## Example: +## sidecars: +## - name: your-image-name +## image: your-image +## imagePullPolicy: Always +## ports: +## - name: portname +## containerPort: 1234 +## +sidecars: [] + +## Specifies whether a ServiceAccount should be created +## +serviceAccount: + create: true + ## The name of the ServiceAccount to use. + ## If not set and create is true, a name is generated using the fullname template + ## + name: + +opensearch: + enabled: true + hostname: opensearch.openg2p.sandbox.net + + oidcWellKnownConfigUrl: '{{ tpl .Values.global.keycloakIssuerUrl $ }}/.well-known/openid-configuration' + oidcClientId: 'openg2p-deduplicator-opensearch-{{ .Release.Namespace }}' + oidcClientSecret: '' + + master: + replicaCount: 1 + persistence: + size: 8Gi + heapSize: 256m + masterOnly: false + resourcesPreset: "none" + + data: + replicaCount: 0 + # persistence: + # size: 8Gi + + coordinating: + replicaCount: 0 + + ingest: + replicaCount: 0 + + extraVolumes: + - name: security-config + configMap: + name: '{{ include "common.names.fullname" . }}-custom-config' + + extraVolumeMounts: + - name: security-config + mountPath: /opt/bitnami/opensearch/config.default/opensearch-security/config.yml + subPath: opensearch-security-config.yml + + security: + enabled: true + tls: + restEncryption: true + extraConfig: |- + _meta: + type: "config" + config_version: 2 + + config: + dynamic: + http: + anonymous_auth_enabled: false + xff: + enabled: false + authc: + basic_internal_auth_domain: + http_enabled: true + transport_enabled: true + order: 0 + description: "Authenticate via HTTP Basic against internal users database" + http_authenticator: + type: basic + challenge: false + authentication_backend: + type: internal + openid_auth_domain: + http_enabled: true + transport_enabled: true + order: 1 + description: "Authenticate via Keycloak OIDC" + http_authenticator: + type: openid + challenge: false + config: + subject_key: preferred_username + roles_key: client_roles + openid_connect_url: '{{ tpl .Values.opensearch.oidcWellKnownConfigUrl $ }}' + authentication_backend: + type: noop + kerberos_auth_domain: + http_enabled: false + transport_enabled: false + proxy_auth_domain: + http_enabled: false + transport_enabled: false + jwt_auth_domain: + http_enabled: false + transport_enabled: false + clientcert_auth_domain: + http_enabled: false + transport_enabled: false + ldap: + http_enabled: false + transport_enabled: false + authz: {} + + dashboards: + enabled: false + replicaCount: 1 + resourcesPreset: "none" + + extraVolumes: + - name: dashboards-config + configMap: + name: '{{ include "common.names.fullname" . }}-custom-config' + + extraVolumeMounts: + - name: dashboards-config + mountPath: /opt/bitnami/opensearch-dashboards/config.default/opensearch_dashboards.yml + subPath: opensearch_dashboards.yml + + extraConfig: |- + path: + data: /bitnami/opensearch-dashboards/data + opensearch: + requestHeadersAllowlist: + - authorization + - securitytenant + ssl: + verificationMode: none + opensearch_security: + auth: + type: + # - basicauth # Enable this to allow password based auth also + - openid + multiple_auth_enabled: true + openid: + connect_url: '{{ tpl .Values.opensearch.oidcWellKnownConfigUrl $ }}' + client_id: '{{ tpl .Values.opensearch.oidcClientId $ }}' + client_secret: '{{ tpl .Values.opensearch.oidcClientSecret $ }}' + base_redirect_url: 'https://{{ tpl .Values.opensearch.hostname $ }}' + logout_url: "" + ui: + openid: + login: + buttonname: "Log in with Keycloak" + multitenancy: + enabled: false # Enable this if multi tenancy is needed. + tenants: + preferred: + - Global + - Private + readonly_mode: + roles: + - kibana_read_only + istio: + enabled: true + virtualservice: + enabled: true + host: "" + gateway: "internal" + destination: '{{ include "common.names.fullname" . }}-dashboards' + destinationPort: '5601' + gateway: + enabled: false + host: "" + ingressController: + istio: ingressgateway + httpTlsRedirect: true + httpEnabled: true + httpsEnabled: false + tls: + mode: SIMPLE + credentialName: "" + +istio: + enabled: true + virtualservice: + enabled: true + host: "" + gateway: "internal" + destination: '{{ include "common.names.fullname" . }}' + destinationPort: '{{ .Values.service.port }}' + prefix: '{{ tpl .Values.envVars.DEDUPLICATOR_OPENAPI_ROOT_PATH $ }}' + rewriteUri: '/' + gateway: + enabled: false + host: "" + ingressController: + istio: ingressgateway + httpTlsRedirect: true + httpEnabled: true + httpsEnabled: false + tls: + mode: SIMPLE + credentialName: "" + +envVars: + DEDUPLICATOR_PORT: '{{ .Values.containerPort }}' + DEDUPLICATOR_OPENAPI_ROOT_PATH: "/v1/deduplicator/" + DEDUPLICATOR_OPENSEARCH_URL: '{{ tpl .Values.opensearchInstallationName $ }}' + DEDUPLICATOR_OPENSEARCH_USERNAME: admin + +envVarsFrom: + DEDUPLICATOR_OPENSEARCH_PASSWORD: + secretKeyRef: + name: '{{ tpl .Values.opensearchInstallationName $ }}' + key: opensearch-password + +opensearchInstallationName: '{{ include "common.names.fullname" $.Subcharts.opensearch }}' diff --git a/openg2p-deduplicator/src/openg2p_deduplicator/config.py b/openg2p-deduplicator/src/openg2p_deduplicator/config.py index eaf0760..2973e2a 100644 --- a/openg2p-deduplicator/src/openg2p_deduplicator/config.py +++ b/openg2p-deduplicator/src/openg2p_deduplicator/config.py @@ -26,5 +26,7 @@ class Settings(BaseSettings): index_name_dedupe_requests: str = "g2p_dedupe_requests" index_name_duplicates: str = "g2p_dedupe_duplicates" + duplicate_entry_id_joiner: str = "<->" + dedupe_runner_initial_delay_secs: int = 5 dedupe_runner_interval_secs: int = 10 diff --git a/openg2p-deduplicator/src/openg2p_deduplicator/controllers/get_duplicates_controller.py b/openg2p-deduplicator/src/openg2p_deduplicator/controllers/get_duplicates_controller.py index 8a62fc6..ca89ab3 100644 --- a/openg2p-deduplicator/src/openg2p_deduplicator/controllers/get_duplicates_controller.py +++ b/openg2p-deduplicator/src/openg2p_deduplicator/controllers/get_duplicates_controller.py @@ -1,6 +1,6 @@ from openg2p_fastapi_common.controller import BaseController -from ..schemas.get_duplicates_response import GetDuplicatesHttpResponse +from ..schemas.get_duplicates_response import GetDuplicatesHttpResponse, HttpDuplicateEntry from ..services.deduplication_service import DeduplicationService @@ -27,4 +27,12 @@ def deduplication_service(self): def get_duplicates_by_id(self, doc_id: str): res = self.deduplication_service.get_duplicates_by_doc_id(doc_id=doc_id) - return GetDuplicatesHttpResponse(duplicates=res.duplicates) + res = [ + HttpDuplicateEntry( + id=entry.duplicate_id, + match_score=entry.match_score, + last_dedupe_request_id=entry.last_dedupe_request_id, + ) + for entry in res + ] + return GetDuplicatesHttpResponse(duplicates=res) diff --git a/openg2p-deduplicator/src/openg2p_deduplicator/schemas/config_request.py b/openg2p-deduplicator/src/openg2p_deduplicator/schemas/config_request.py index 120042f..c09e8a2 100644 --- a/openg2p-deduplicator/src/openg2p_deduplicator/schemas/config_request.py +++ b/openg2p-deduplicator/src/openg2p_deduplicator/schemas/config_request.py @@ -1,14 +1,29 @@ from datetime import datetime +from enum import Enum from openg2p_fastapi_common.errors.http_exceptions import BadRequestError -from pydantic import BaseModel, field_validator +from pydantic import BaseModel, field_validator, model_validator + + +class DedupeConfigFieldQueryType(Enum): + term = "term" + match = "match" class DedupeConfigField(BaseModel): name: str + query_type: DedupeConfigFieldQueryType = DedupeConfigFieldQueryType.match fuzziness: str | None = None boost: float = 1 + @model_validator(mode="after") + def check_fuzziness_with_query_type(self): + if self.query_type == DedupeConfigFieldQueryType.term and self.fuzziness: + raise BadRequestError( + code="G2P-DEDUP-400", message="Fuzziness cannot be assigned for term queries." + ) + return self + class DedupeConfig(BaseModel): name: str diff --git a/openg2p-deduplicator/src/openg2p_deduplicator/schemas/deduplicate_request.py b/openg2p-deduplicator/src/openg2p_deduplicator/schemas/deduplicate_request.py index 1a9f748..5541a71 100644 --- a/openg2p-deduplicator/src/openg2p_deduplicator/schemas/deduplicate_request.py +++ b/openg2p-deduplicator/src/openg2p_deduplicator/schemas/deduplicate_request.py @@ -1,7 +1,7 @@ from datetime import datetime from enum import Enum -from pydantic import BaseModel, field_serializer +from pydantic import BaseModel class DeduplicationStatus(Enum): @@ -38,7 +38,3 @@ class DeduplicateRequestEntry(BaseModel): wait_before_exec_secs: int created_at: datetime updated_at: datetime | None = None - - @field_serializer("status") - def status_serialize(self, status: DeduplicationStatus): - return status.value diff --git a/openg2p-deduplicator/src/openg2p_deduplicator/schemas/get_duplicates_response.py b/openg2p-deduplicator/src/openg2p_deduplicator/schemas/get_duplicates_response.py index 1f93698..e4bf23c 100644 --- a/openg2p-deduplicator/src/openg2p_deduplicator/schemas/get_duplicates_response.py +++ b/openg2p-deduplicator/src/openg2p_deduplicator/schemas/get_duplicates_response.py @@ -1,15 +1,18 @@ from pydantic import BaseModel -class DuplicateEntry(BaseModel): - id: str +class StoredDuplicateEntry(BaseModel): + original_id: str + duplicate_id: str match_score: float last_dedupe_request_id: str -class StoredDuplicates(BaseModel): - duplicates: list[DuplicateEntry] +class HttpDuplicateEntry(BaseModel): + id: str + match_score: float + last_dedupe_request_id: str -class GetDuplicatesHttpResponse(StoredDuplicates): - pass +class GetDuplicatesHttpResponse(BaseModel): + duplicates: list[HttpDuplicateEntry] diff --git a/openg2p-deduplicator/src/openg2p_deduplicator/services/config_service.py b/openg2p-deduplicator/src/openg2p_deduplicator/services/config_service.py index 490ab70..f264642 100644 --- a/openg2p-deduplicator/src/openg2p_deduplicator/services/config_service.py +++ b/openg2p-deduplicator/src/openg2p_deduplicator/services/config_service.py @@ -25,7 +25,7 @@ def add_or_update_config(self, config: DedupeConfig): self.opensearch_client.index( index=_config.index_name_dedupe_configs, id=urllib.parse.quote(config.name, safe=""), - body=config.model_dump(), + body=config.model_dump(mode="json"), timeout=_config.opensearch_api_timeout, ) diff --git a/openg2p-deduplicator/src/openg2p_deduplicator/services/deduplication_service.py b/openg2p-deduplicator/src/openg2p_deduplicator/services/deduplication_service.py index 4730747..59fee0c 100644 --- a/openg2p-deduplicator/src/openg2p_deduplicator/services/deduplication_service.py +++ b/openg2p-deduplicator/src/openg2p_deduplicator/services/deduplication_service.py @@ -7,9 +7,9 @@ from openg2p_fastapi_common.utils.ctx_thread import CTXThread from ..config import Settings -from ..schemas.config_request import DedupeConfig +from ..schemas.config_request import DedupeConfig, DedupeConfigFieldQueryType from ..schemas.deduplicate_request import DeduplicateRequestEntry, DeduplicationStatus -from ..schemas.get_duplicates_response import DuplicateEntry, StoredDuplicates +from ..schemas.get_duplicates_response import StoredDuplicateEntry from ..services.config_service import DedupeConfigService from ..services.opensearch_service import OpenSearchClientService @@ -60,7 +60,7 @@ def create_dedupe_request( status_description="Deduplication in progress", wait_before_exec_secs=wait_before_exec_secs, created_at=datetime.now(), - ).model_dump(), + ).model_dump(mode="json"), ) return status @@ -76,16 +76,17 @@ def update_dedupe_request(self, request_id: str, key_value: dict): timeout=_config.opensearch_api_timeout, ) - def get_duplicates_by_doc_id(self, doc_id: str) -> StoredDuplicates: + def get_duplicates_by_doc_id(self, doc_id: str) -> list[StoredDuplicateEntry]: try: - res = self.opensearch_client.get_source( + res = self.opensearch_client.search( index=_config.index_name_duplicates, - id=doc_id, timeout=_config.opensearch_api_timeout, + body={"query": {"term": {"original_id": {"value": doc_id}}}}, ) + res = res.get("hits", {}).get("hits", []) except Exception: - res = {"duplicates": []} - res = StoredDuplicates.model_validate(res) + res = [] + res = [StoredDuplicateEntry.model_validate(entry.get("_source", {})) for entry in res] return res def run_dedupe_job(self): @@ -142,15 +143,17 @@ def run_dedupe_task(self): ) return # Find nested duplicates with the give config and update their entries - no_of_dups = self.find_and_update_duplicates_by_doc_id( + duplicates = self.find_duplicates_by_doc_id( dedupe_config, dedupe_request.id, dedupe_request.doc_id ) + # Create duplicate entries + self.create_duplicate_entries(dedupe_request.doc_id, duplicates, dedupe_request.id) # Update request status self.update_dedupe_request( dedupe_request.id, { "status": DeduplicationStatus.completed.value, - "status_description": f"Deduplication Complete. {no_of_dups} found.", + "status_description": f"Deduplication Complete. {len(duplicates or [])} found.", "updated_at": datetime.now(), }, ) @@ -166,9 +169,7 @@ def run_dedupe_task(self): ) pass - def find_and_update_duplicates_by_doc_id( - self, dedupe_config: DedupeConfig, dedupe_request_id, doc_id, already_updated_docs: list = None - ): + def find_duplicates_by_doc_id(self, dedupe_config: DedupeConfig, dedupe_request_id, doc_id): # Get Record with the given id query_time = datetime.now() try: @@ -186,54 +187,65 @@ def find_and_update_duplicates_by_doc_id( "updated_at": query_time, }, ) - return -1 + return [] # Construct match query with all fields. And search for other records - match_query = [] + query_list = [] for field in dedupe_config.fields: - query = { - "query": record.get(field.name), - "boost": field.boost, - } - if field.fuzziness: - query["fuzziness"] = field.fuzziness - match_query.append({"match": {field.name: query}}) + if field.query_type == DedupeConfigFieldQueryType.match: + match_query = { + "query": record.get(field.name), + "boost": field.boost, + } + if field.fuzziness: + match_query["fuzziness"] = field.fuzziness + query_list.append({"match": {field.name: match_query}}) + elif field.query_type == DedupeConfigFieldQueryType.term: + query_list.append( + { + "term": { + field.name: { + "value": record.get(field.name), + "boost": field.boost, + "case_insensitive": True, + } + } + } + ) duplicates_res = self.opensearch_client.search( - body={"_source": False, "query": {"bool": {"must": match_query}}}, + body={"_source": False, "query": {"bool": {"must": query_list}}}, index=dedupe_config.index, timeout=_config.opensearch_api_timeout, ) duplicates_res = duplicates_res.get("hits", {}).get("hits", []) duplicates_res = [duplicate for duplicate in duplicates_res if duplicate.get("_id") != doc_id] # Update duplicates in the current record and all other records - self.opensearch_client.index( - index=_config.index_name_duplicates, - id=doc_id, - body=StoredDuplicates( - duplicates=[ - DuplicateEntry( - id=duplicate.get("_id"), - match_score=duplicate.get("_score"), - last_dedupe_request_id=dedupe_request_id, - ) - for duplicate in duplicates_res - if (not dedupe_config.score_threshold) - or dedupe_config.score_threshold <= duplicate.get("_score") - ] - ).model_dump(), - timeout=_config.opensearch_api_timeout, - ) - already_updated_docs = already_updated_docs or [] - already_updated_docs.append(doc_id) - for duplicate in duplicates_res: - if (not dedupe_config.score_threshold) or ( - dedupe_config.score_threshold <= duplicate.get("_score") - ): - duplicate_id = duplicate.get("_id") - if duplicate_id not in already_updated_docs: - self.find_and_update_duplicates_by_doc_id( - dedupe_config, dedupe_request_id, duplicate_id, already_updated_docs - ) - return len(duplicates_res) + return duplicates_res + + def create_duplicate_entries(self, doc_id: str, duplicates: list, request_id: str): + for dup in duplicates: + # Create to and fro duplicate entries + self.opensearch_client.index( + index=_config.index_name_duplicates, + id=f"{doc_id}{_config.duplicate_entry_id_joiner}{dup.get('_id')}", + timeout=_config.opensearch_api_timeout, + body=StoredDuplicateEntry( + original_id=doc_id, + duplicate_id=dup.get("_id"), + match_score=dup.get("_score"), + last_dedupe_request_id=request_id, + ).model_dump(mode="json"), + ) + self.opensearch_client.index( + index=_config.index_name_duplicates, + id=f"{dup.get('_id')}{_config.duplicate_entry_id_joiner}{doc_id}", + timeout=_config.opensearch_api_timeout, + body=StoredDuplicateEntry( + original_id=dup.get("_id"), + duplicate_id=doc_id, + match_score=dup.get("_score"), + last_dedupe_request_id=request_id, + ).model_dump(mode="json"), + ) def is_runner_thread_alive(self): return self.dedupe_runner.is_alive()