From f142af8fe89041ee2902c65467020a7a644d2799 Mon Sep 17 00:00:00 2001
From: WillForan <willforan@gmail.com>
Date: Thu, 24 Oct 2024 21:58:51 -0400
Subject: [PATCH] =?UTF-8?q?=E2=9C=A8=20dcmmeta2tsv.py:=20slower=20but=20CS?=
 =?UTF-8?q?A=20capable=20replacement=20for=20dicom=5Fhinfo?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 00_build_db.bash    | 48 ++++++++++++++++++++--------------------
 01_txt_to_sqlite.py | 22 ++++++++++++++-----
 dcmmeta2tsv.py      | 53 +++++++++++++++++++++++++++++++++++++++++++++
 taglist.txt         | 22 +++++++++++++++++++
 4 files changed, 115 insertions(+), 30 deletions(-)
 create mode 100755 dcmmeta2tsv.py
 create mode 100644 taglist.txt

diff --git a/00_build_db.bash b/00_build_db.bash
index 66663fe0..a1f234bc 100755
--- a/00_build_db.bash
+++ b/00_build_db.bash
@@ -2,31 +2,29 @@
 # quick pass at building minimal text database of dicom headers
 # 20240907 WF - init
 #
-declare -A t
-t[AcqTime]="0008,0032"       # Acquisition Time like 145446.685000
-t[AcqDate]="0008,0022"       # like 20241004
-t[SeriesNumber]="0020,0011"  # REL Series Number
-t[SubID]="0010,0010"         # patient name
-t[iPAT]="0051,1011"          # PATModeText (private field)
-t[Comments]="0020,4000"      #REL Image Comments//Unaliased MB3/PE4/LB SENSE1
-t[Operator]="0008,1070"
-t[Project]="0008,1030"       # ID Study Description//Brain^wpc-8620
-t[SequenceName]="0008,103e"  # series descripton
-t[SequenceType]="0018,0024"  # ACQ Sequence Name
-t[PED_major]="0018,1312"     #   ACQ Phase Encoding Direction, ROW or COL
-t[TR]="0018,0080"
-t[TE]="0018,0081"
-t[Matrix]="0018,1310"     # ACQ Acquisition Matrix
-t[PixelResol]="0028,0030" #  IMG Pixel Spacing//2.2978723049164\2.2978723049164
-# https://neurostars.org/t/how-is-bandwidthperpixelphaseencode-calculated/26526 (0021,1153)
-t[BWP]="0018,0095"        # ACQ Pixel Bandwidth (?)
-t[BWPPE]="0019,1028"      # in matlab S.BandwidthPerPixelPhaseEncode;
-t[FA]="0018,1314"        
-t[TA]="0051,100a"
-t[FoV]="0051,100c" # eg FoV 1617*1727; but actually cocaluated from matrix and spacing?
+export TAG_ARGS=$(sed '1d;/^#/d' taglist.txt | cut -f2 | sed 's/^/-tag /' | paste -sd' ')  # drop header and '#' rows BEFORE cut: field 2 of a commented row no longer contains '#'
+dcminfo(){  # prints "Phase<TAB>ucPAT<TAB>" parsed from gdcmdump output of "$1" ("NA" when not found), then dicom_hinfo's values for "$@"
+ declare -g TAG_ARGS
+ #echo "# $1" >&2
+ gdcmdump -dC "$1" |
+   perl -ne 'BEGIN{%a=(Phase=>"NA", ucPAT=>"NA")}
+   $a{substr($1,0,5)} = $2 if m/(PhaseEncodingDirectionPositive.*Data..|ucPATMode\s+=\s+)(\d+)/;
+   END {print join("\t", @a{qw/Phase ucPAT/}), "\t"}'
+ dicom_hinfo -sepstr $'\t' -last -full_entry $TAG_ARGS "$@"  # TAG_ARGS stays unquoted so it word-splits into separate "-tag gggg,eeee" arguments
+}
 
-for d in  /Volumes/Hera/Raw/MRprojects/Habit/20*-*/1*_2*/dMRI_*/; do
-       	find  $d -maxdepth 1 -type f -print -quit
+export -f dcminfo  # so the shells started by parallel below can call it
+
+cnt=0
+#for d in /Volumes/Hera/Raw/MRprojects/Habit/20*-*/1*_2*/dMRI_*/; do
+for d in  /Volumes/Hera/Raw/MRprojects/Habit/2022.08.23-14.24.18/11878_20220823/HabitTask_704x752.19/ /Volumes/Hera/Raw/MRprojects/Habit/2022.08.23-14.24.18/11878_20220823/dMRI_b0_AP_140x140.35/  /Volumes/Hera/Raw/MRprojects/Habit/2022.08.23-14.24.18/11878_20220823/Resting-state_ME_476x504.14/; do
+  echo "# $d" >&2
+  # emit a single dicom: the first regular file directly inside $d
+  find  "$d" -maxdepth 1 -type f -print -quit
+  let ++cnt
+  [ $cnt -gt 2 ] && break  # debug cap: stop after 3 directories
 done |
-  xargs dicom_hinfo -sepstr $'\t' -last -full_entry $(printf " -tag %s" "${t[@]}") |
+  # TODO: use './dcmmeta2tsv.py' instead of dcminfo?
+  #xargs ./dcm2nii_check.bash |
+  parallel -n1 dcminfo |
   tee db.txt
diff --git a/01_txt_to_sqlite.py b/01_txt_to_sqlite.py
index 5735bcc1..c4e87ff4 100755
--- a/01_txt_to_sqlite.py
+++ b/01_txt_to_sqlite.py
@@ -1,7 +1,18 @@
 #!/usr/bin/env python3
+"""
+convert db.txt into a sqlite database
+"""
 import sqlite3
-# col names from 00_build_db.bash
-colnames = ["AcqTime", "AcqDate", "SeriesNumber", "SubID", "iPAT", "Comments", "Operator", "Project", "SequenceName", "SequenceType", "PED_major", "TR", "TE", "Matrix", "PixelResol", "BWP", "BWPPE", "FA", "TA", "FoV"]
+import re
+
+# Column order of a db.txt row: the two CSA values from 00_build_db.bash
+# (not in taglist.txt), then one column per taglist.txt row (rows starting
+# with 'name' or '#' are skipped), then the final file name column (not a tag).
+with open('taglist.txt','r') as f:
+    tag_colnames = [row.split("\t")[0] for row in f
+                    if re.search("^name|^#", row) is None]
+
+colnames = ["Phase", "iPAT", *tag_colnames, 'filename']
 
 sql = sqlite3.connect("db.sqlite") # see schema.sql
 
@@ -14,11 +25,12 @@
 with open('db.txt','r') as f:
     while line := f.readline():
         vals = line.split("\t")
-        d = {k:v for (k,v) in zip(colnames, vals)}
-        val_array = ",".join([d[k] for k in consts])
+        d = dict(zip(colnames, vals))  # column name -> value for this db.txt row
+        val_array = [d[k] for k in consts]  # list, not a joined string; presumably one value per placeholder in sql_cmd -- TODO confirm
         print(val_array)
         sql.execute(sql_cmd, val_array)
-        break
+        continue  # skips the unfinished acq insert below until the TODO is resolved
         # TODO: FIX ME
         last_row_id = sql.execute("SELECT id FROM acq_param WHERE  = ?;", ())
         sql.execute("insert into acq() values () ", (last_row_id))
+sql.commit()  # commit the inserts made in the loop above
diff --git a/dcmmeta2tsv.py b/dcmmeta2tsv.py
new file mode 100755
index 00000000..70da12a0
--- /dev/null
+++ b/dcmmeta2tsv.py
@@ -0,0 +1,53 @@
+#!/usr/bin/env python3
+"""
+Give a tab separated metadata value line per dicom file.
+"""
+import os
+import sys
+import re
+import pydicom
+#import warnings
+#warnings.filterwarnings("ignore", module="nibabel.nicom.csareader")
+import nibabel.nicom.csareader as csareader
+
+def tagpair_to_hex(csv_str):
+    """
+    Turn a comma separated tag from our text files (e.g. "0051,1017") into a
+    tuple of hex strings, ('0x51', '0x1017'), used below to index a pydicom dataset.
+    """
+    return tuple(format(int(part, 16), "#x") for part in csv_str.split(","))
+
+def read_known_tags(tagfile="taglist.txt"):
+    """
+    Parse a tab separated tag list (columns: name, tag, desc) into a list of
+    dicts. Rows starting with 'name' (the header) or '#' are skipped.
+    """
+    fields = ["name","tag","desc"]
+    with open(tagfile,'r') as f:
+        rows = [row.split("\t") for row in f
+                if re.search("^name|^#", row) is None]
+    return [dict(zip(fields, row)) for row in rows]
+
+if __name__ == "__main__":
+    tags = read_known_tags()
+    for tag_d in tags:
+        tag_d['tag'] = tagpair_to_hex(tag_d['tag'])
+    # one tab separated output line per dicom path given on the command line
+    for dcm_path in sys.argv[1:]:
+        if not os.path.isfile(dcm_path):
+            raise Exception(f"Bad command line argument: '{dcm_path}' DNE")
+        dcm = pydicom.dcmread(dcm_path)
+        meta = [dcm[tag_d['tag']].value for tag_d in tags]
+        # CSA values come from element (0029,1010), parsed by nibabel's csareader
+        csa_str = dcm[(0x0029,0x1010)].value
+        csa_tr = csareader.read(csa_str)
+        pedp = csa_tr['tags']['PhaseEncodingDirectionPositive']['items']
+        pedp = pedp[0] if pedp else "null"
+        ipat = csa_tr['tags']['ImaPATModeText']['items']
+        ipat = ipat[0] if ipat else "null"
+        # order here matches 00_build_db.bash
+        csa_tags =  [pedp, ipat]
+        # NB. arrays are '[x, y, z]' instead of ' x y z ' or 'x/y'
+        # like in dicom_hdr (00_build_db.bash)
+        all_tags =[str(x) for x in csa_tags + meta] + [dcm_path]
+        print("\t".join(all_tags))
diff --git a/taglist.txt b/taglist.txt
new file mode 100644
index 00000000..08d0f377
--- /dev/null
+++ b/taglist.txt
@@ -0,0 +1,22 @@
+name	tag	desc
+AcqTime	0008,0032	 Acquisition Time like 145446.685000
+AcqDate	0008,0022	 like 20241004
+SeriesNumber	0020,0011	 REL Series Number
+SubID	0010,0010	 patient name
+#iPAT	0051,1011	 PATModeText (private field); not implemented, use CSA value ucPAT
+Comments	0020,4000	REL Image Comments//Unaliased MB3/PE4/LB SENSE1
+Operator	0008,1070
+Project	0008,1030	 ID Study Description//Brain^wpc-8620
+SequenceName	0008,103e	 series description
+SequenceType	0018,0024	 ACQ Sequence Name
+PED_major	0018,1312	   ACQ Phase Encoding Direction, ROW or COL
+TR	0018,0080
+TE	0018,0081
+Matrix	0018,1310	 ACQ Acquisition Matrix
+PixelResol	0028,0030	  IMG Pixel Spacing//2.2978723049164\2.2978723049164
+#https://neurostars.org/t/how-is-bandwidthperpixelphaseencode-calculated/26526 (0021,1153)
+BWP	0018,0095	 ACQ Pixel Bandwidth (?) also unimplemented? need CSA value?
+BWPPE	0019,1028	 in matlab S.BandwidthPerPixelPhaseEncode;
+FA	0018,1314        
+TA	0051,100a
+FoV	0051,100c	 eg FoV 1617*1727; but actually calculated from matrix and spacing?