Skip to content
ngadmini edited this page Oct 3, 2022 · 10 revisions

Use this bash script to obtain the invalid TLDs found in: https://raw.githubusercontent.com/alsyundawy/dnstrust-apjii/main/raw/db_trustpositif.txt

bash_script

Run this script to check for changes in db_trustpositif.txt, then modify grab_regex according to those changes.

#!/usr/bin/env bash
# TAGS
#   tlds_validation.sh
#   v1-beta
# AUTHOR
#   [email protected]
# TL;DR
#   Compares the TLDs used in db_trustpositif.txt with the IANA TLD list and
#   reports the ones that are not in the IANA list.
#   Exception: entries containing non-ASCII characters, and entries ending in
#   a port or other numeric suffix (e.g. some.domains:900), are left out.

# -e aborts on the first failing command; pipefail extends that to a failure
# in the middle of a pipeline (e.g. a download that dies half-way), which
# plain `set -e` does not notice.
set -eo pipefail
PATH=/usr/local/bin:/usr/bin:/bin:${PATH}
# Absolute path of the directory containing this script.
_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# sed program for the IANA list: delete every line containing '#', then
# lowercase all ASCII capitals (\L is a GNU sed extension).
_reg1="/#/d;s/[A-Z]/\L&/g"
# sed program for the trust list: lowercase, then delete lines that end in
# ':' or '.' followed by two or more digits (e.g. a port number).
_reg2="s/[A-Z]/\L&/g;/\(:\|\.\)[0-9]\{2,\}$/d"
# Fetched over HTTPS so the reference list cannot be altered in transit.
_url1="https://data.iana.org/TLD/tlds-alpha-by-domain.txt"
_url2="https://raw.githubusercontent.com/alsyundawy/dnstrust-apjii/main/raw/db_trustpositif.txt"

# Work inside the script's own directory so the output files land next to it.
cd "${_DIR}"
printf "\n[INFO] starting tld validation, target: %s\n" "$(basename "${_url2}")"
# Remove only the files a previous run of this script produced.  A blanket
# recursive delete of every "*.txt" would also wipe unrelated text files
# stored anywhere below the script directory.
rm -f -- tlds-alpha-by-domain.txt tlds_trust.txt valid_tlds.txt invalid_tlds.txt regex.txt

# Make sure both sources answer with HTTP 200 before downloading anything.
# On failure, say which URL was the problem instead of exiting silently.
for _X in "${_url1}" "${_url2}"; do
   # `|| true`: a curl transport error must reach the check below rather
   # than abort the script through `set -e` without any message.
   _code=$(curl -s -o /dev/null -w "%{http_code}" "${_X}") || true
   if [[ ${_code} != 200 ]]; then
      printf "[ERROR] %s is not available (HTTP status: %s)\n" "${_X}" "${_code:-none}" >&2
      exit 1
   fi
done

# Download the IANA list and normalise it with ${_reg1}.
# -f: on an HTTP error emit nothing instead of passing the error page on as
#     data; -S: still print curl's own error message despite -s.
# '>' (not '>>') so a leftover file from an earlier run is never appended to.
curl -fsS "${_url1}" | sed "${_reg1}" > tlds-alpha-by-domain.txt
# Download the trust list, normalise it with ${_reg2}, keep only the text
# after the last '.', de-duplicate, and drop entries with non-ASCII bytes.
curl -fsS "${_url2}" \
   | sed "${_reg2}" \
   | awk -F. '{print $NF}' \
   | sort -u \
   | LC_ALL=C grep -Pv "[^\x00-\x7F]" > tlds_trust.txt

# Intersection of the two lists: after merging and sorting, `uniq -d` keeps
# the lines that occur more than once.  Assuming neither list repeats an
# entry internally, those are the TLDs present in both files.
sort tlds_trust.txt tlds-alpha-by-domain.txt \
   | uniq -d \
   > valid_tlds.txt
# Difference: merge the valid TLDs with the trust list again; `uniq -u` keeps
# the lines that occur exactly once, i.e. trust-list entries that were not
# found in the IANA list.
sort valid_tlds.txt tlds_trust.txt \
   | uniq -u \
   > invalid_tlds.txt

# Report the number of invalid TLDs; the ' flag in %'d asks printf for
# locale-dependent thousands grouping.
_count=$(wc -l < invalid_tlds.txt)
printf "[INFO] there are %'d invalid tlds\n" "${_count}"
printf "[INFO] build regex\nyou can replace grab_regex line 4 with this regex.txt\n"
# Build a sed delete command of the form  /\.\(tld1\|tld2\|...\)$/d :
#   1. backslash-escape characters that are special in a sed BRE address
#      ( ] [ \ / . ^ $ * ) so a malformed entry cannot break or change the
#      generated expression;
#   2. join all lines into one, separated by '\|';
#   3. wrap the result in  /\.\(  ...  \)$/d .
# An empty invalid_tlds.txt yields an empty regex.txt.
sed 's|[][\\/.^$*]|\\&|g' invalid_tlds.txt \
   | sed ':a;N;$!ba;s/\n/\\\|/g' \
   | sed 's/^/\/\\.\\\(/;s/$/\\\)\$\/d/' > regex.txt

printf "bye!\n"
exit 0