Handling the leading '#' in basis string

sunqm · Sep 20, 2024 · 46fcabb
1 parent 943d39a
commit 46fcabb
Show file tree

Hide file tree

Showing 3 changed files with 28 additions and 11 deletions.
diff --git a/pyscf/gto/basis/parse_cp2k.py b/pyscf/gto/basis/parse_cp2k.py
@@ -58,7 +58,7 @@ def parse(string, symb=None, optimize=False):
             raise BasisNotFoundError(f'Basis not found for {symb}')
 
     bastxt = []
-    for dat in string.splitlines():
+    for dat in string:
         x = dat.split('#')[0].strip()
         if (x and not x.startswith('END') and not x.startswith('BASIS')):
             bastxt.append(x)
@@ -125,5 +125,4 @@ def search_seg(basisfile, symb):
     raw_basis = _search_basis_block(fdata[1:], symb)
     if not raw_basis:
         raise BasisNotFoundError(f'Basis for {symb} not found in {basisfile}')
-    return [x.strip() for x in raw_basis.splitlines()
-            if x.strip() and 'END' not in x]
+    return [x for x in raw_basis if x and 'END' not in x]
diff --git a/pyscf/gto/basis/parse_nwchem.py b/pyscf/gto/basis/parse_nwchem.py
@@ -86,7 +86,7 @@ def parse(string, symb=None, optimize=True):
             raise BasisNotFoundError('Basis not found for %s' % symb)
 
     raw_basis = []
-    for dat in string.splitlines():
+    for dat in string:
         dat = dat.split('#')[0].strip()  # Use # to start comments
         dat_upper = dat.upper()
         if (dat and not dat_upper.startswith('END') and not dat_upper.startswith('BASIS')):
@@ -154,16 +154,20 @@ def search_seg(basisfile, symb):
     with open(basisfile, 'r') as fin:
         fdata = re.split(BASIS_SET_DELIMITER, fin.read())
     raw_basis = _search_basis_block(fdata, symb)
-    return [x for x in raw_basis.splitlines() if x and 'END' not in x]
+    return [x for x in raw_basis if x and 'END' not in x]
 
 def _search_basis_block(raw_data, symb):
-    raw_basis = ''
     for dat in raw_data:
-        dat0 = dat.split(None, 1)
-        if dat0 and dat0[0] == symb:
-            raw_basis = dat
-            break
-    return raw_basis
+        basis_lines = dat.splitlines()
+        for line in basis_lines:
+            # Skip leading '# xxx' comment lines and blank lines. strip() is tested
+            # first so a whitespace-only line is skipped instead of raising IndexError.
+            if not line.strip() or line.lstrip().startswith('#'):
+                continue
+            if line.split(None, 1)[0] == symb:
+                return [x.strip() for x in basis_lines]
+            break
+    return []
 
 def convert_basis_to_nwchem(symb, basis):
     '''Convert the internal basis format to NWChem format string'''

diff --git a/pyscf/gto/test/test_basis_parser.py b/pyscf/gto/test/test_basis_parser.py
@@ -94,6 +94,20 @@ def test_parse_basis(self):
         basis_dat = gto.basis.parse_nwchem.parse(basis_str)
         self.assertEqual(len(basis_dat), 3)
 
+        basis_str = '''
+#BASIS SET: (3s) -> [1s]
+H    S
+     18.7311370     0.03349460
+      2.8253937     0.23472695
+      0.6401217     0.81375733
+#BASIS SET:
+#C    S
+#     1.5   1.
+C    SP
+      0.25  1.  1.'''
+        basis_dat = gto.basis.parse_nwchem.parse(basis_str, 'C')
+        self.assertEqual(len(basis_dat), 2)
+
     def test_parse_ecp(self):
         ecp_str = '''
 #