diff --git a/gcn_classic_text_to_json/conversion.py b/gcn_classic_text_to_json/conversion.py
index fbb7eca..da16417 100644
--- a/gcn_classic_text_to_json/conversion.py
+++ b/gcn_classic_text_to_json/conversion.py
@@ -123,14 +123,18 @@ def text_to_json(notice, keywords_dict):
         notice_ra = keywords_dict["standard"]["ra"]
         ra_data = notice[notice_ra].split()
 
-        if ra_data[0] != "Undefined":
+        if ra_data[0] == "Undefined":
+            output["ra"] = None
+        else:
             output["ra"] = float(ra_data[0][:-1])
 
     if "dec" in keywords_dict["standard"]:
         notice_dec = keywords_dict["standard"]["dec"]
         dec_data = notice[notice_dec].split()
 
-        if dec_data[0] != "Undefined":
+        if dec_data[0] == "Undefined":
+            output["dec"] = None
+        else:
             output["dec"] = float(dec_data[0][:-1])
 
     if "additional" in keywords_dict:
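For context on the hunk above (not part of the diff): with this change `text_to_json` writes an explicit `null` when a notice reports an `Undefined` RA or Dec, instead of silently omitting the key. A minimal sketch of that behaviour with a made-up notice dictionary; the keyword names `GRB_RA` and `GRB_DEC` and all values are hypothetical, not the real GCN keywords used by `keywords_dict`:

```python
# Hypothetical parsed notice; keyword names and values are illustrative only.
notice = {
    "GRB_RA": "Undefined",
    "GRB_DEC": "-12.345d {-12d 20' 42\"} (J2000)",
}


def parse_coordinate(raw):
    """Mirror the new branch: None for 'Undefined', otherwise strip the trailing unit."""
    first = raw.split()[0]
    return None if first == "Undefined" else float(first[:-1])


print(parse_coordinate(notice["GRB_RA"]))   # None
print(parse_coordinate(notice["GRB_DEC"]))  # -12.345
```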
diff --git a/gcn_classic_text_to_json/notices/konus/README.md b/gcn_classic_text_to_json/notices/konus/README.md
new file mode 100644
index 0000000..9b9fc83
--- /dev/null
+++ b/gcn_classic_text_to_json/notices/konus/README.md
@@ -0,0 +1,16 @@
+# KONUS Text Conversion
+
+Parses the trigger tables in the multiple webpages associated with KONUS triggers and creates a `KONUS_{sernum}.json` file for each trigger in a `konus_jsons` directory inside an `output` directory, where `sernum` is a sequential number with no relation to the triggers.
+
+### Uses the following fields from the core schema for text notice fields
+- `id` → Trig#
+- `trigger_time` → Trig_Date, Trig_Time
+- `classification` → Event
+
+### Defines the following new fields for the text notice fields
+- `lightcurve_image_url` → GIF
+- `lightcurve_textfile_url` → Text
+- `detector_number` → Det
+
+## Caveats
+- Some fields in the tables being parsed are simply empty. I've elected to skip these and not add the fields to the JSONs, as that makes validation simpler.
diff --git a/gcn_classic_text_to_json/notices/konus/__init__.py b/gcn_classic_text_to_json/notices/konus/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/gcn_classic_text_to_json/notices/konus/__main__.py b/gcn_classic_text_to_json/notices/konus/__main__.py
new file mode 100644
index 0000000..575fcde
--- /dev/null
+++ b/gcn_classic_text_to_json/notices/konus/__main__.py
@@ -0,0 +1,4 @@
+from . import conversion
+
+if __name__ == "__main__":
+    conversion.parse_all_konus_webpages()
diff --git a/gcn_classic_text_to_json/notices/konus/conversion.py b/gcn_classic_text_to_json/notices/konus/conversion.py
new file mode 100644
index 0000000..54ad494
--- /dev/null
+++ b/gcn_classic_text_to_json/notices/konus/conversion.py
@@ -0,0 +1,97 @@
+import json
+import os
+import re
+
+import requests
+from bs4 import BeautifulSoup
+
+
+def create_all_konus_jsons(link, sernum):
+    """Parses through the table of KONUS triggers in `link` to create their respective JSONs
+    and creates a konus_jsons directory inside an output directory.
+
+    Parameters
+    ----------
+    link: string
+        The link to be parsed.
+    sernum: int
+        A sequential number used in the JSON filenames. It has no relation to the data in the JSONs.
+
+    Returns
+    -------
+    sernum: int
+        The updated sernum, to be used in the next call of the function."""
+    output_path = "./output/konus_jsons/"
+    if not os.path.exists(output_path):
+        os.makedirs(output_path)
+
+    file = requests.get(link)
+    data = file.text
+
+    soup = BeautifulSoup(data, "html.parser")
+
+    rows = soup.find_all("tr")
+
+    for row in rows[1:]:
+        output_dict = {
+            "$schema": "https://gcn.nasa.gov/schema/main/gcn/notices/classic/konus/alert.schema.json"
+        }
+
+        cols = row.find_all("td")
+
+        trigger_date = cols[0].text.strip()
+        trigger_time = cols[2].text.split()[0]
+        output_dict["trigger_time"] = (
+            f"{trigger_date[:4]}-{trigger_date[4:6]}-{trigger_date[-2:]}T{trigger_time}Z"
+        )
+
+        if cols[3].text != " " and cols[3].text != "" and cols[3].text != "\n":
+            output_dict["detector_number"] = int(cols[3].text)
+
+        if cols[4].text:
+            output_dict["classification"] = {cols[4].text.strip(): 1}
+
+        if cols[5].text:
+            output_dict["id"] = [int(cols[5].text.strip())]
+
+        incomplete_image_link = cols[7].find("a").get("href")
+        output_dict["lightcurve_image_url"] = (
+            f"https://gcn.gsfc.nasa.gov/{incomplete_image_link}"
+        )
+
+        incomplete_textfile_link = cols[9].find("a").get("href")
+        output_dict["lightcurve_textfile_url"] = (
+            f"https://gcn.gsfc.nasa.gov/{incomplete_textfile_link}"
+        )
+
+        with open(f"{output_path}KONUS_{sernum}.json", "w") as f:
+            json.dump(output_dict, f)
+        sernum += 1
+
+    return sernum
+
+
+def parse_all_konus_webpages():
+    """The main KONUS webpage links to multiple webpages with more links.
+    This function finds them and calls create_all_konus_jsons for each."""
+
+    main_link = "https://gcn.gsfc.nasa.gov/konus_grbs.html"
+    file = requests.get(main_link)
+    data = file.text
+
+    soup = BeautifulSoup(data, "html.parser")
+
+    search_string = re.compile("grbs.html")
+    html_tags = soup.find_all("a", attrs={"href": search_string})
+
+    html_links = []
+
+    for tag in html_tags:
+        incomplete_link = tag.get("href")
+        html_links.append(f"https://gcn.gsfc.nasa.gov/{incomplete_link}")
+
+    html_links.append(main_link)
+
+    sernum = 1
+    for link in html_links:
+        sernum = create_all_konus_jsons(link, sernum)
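As a reference for reviewers (not part of the diff), this is roughly the notice shape that `create_all_konus_jsons` writes. The column values below are invented, not taken from a real KONUS table row; the field-to-column mapping follows the README above, and the `trigger_time` line reuses the exact f-string from the code:

```python
import json

# Invented example values for the Trig_Date and Trig_Time columns.
trigger_date = "20010315"      # YYYYMMDD
trigger_time = "08:42:17.25"   # first token of the Trig_Time cell

notice = {
    "$schema": "https://gcn.nasa.gov/schema/main/gcn/notices/classic/konus/alert.schema.json",
    "trigger_time": f"{trigger_date[:4]}-{trigger_date[4:6]}-{trigger_date[-2:]}T{trigger_time}Z",
    "detector_number": 1,          # Det column
    "classification": {"GRB": 1},  # Event column
    "id": [12345],                 # Trig# column
}

print(json.dumps(notice, indent=4))  # trigger_time becomes "2001-03-15T08:42:17.25Z"
```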
diff --git a/gcn_classic_text_to_json/notices/near/README.md b/gcn_classic_text_to_json/notices/near/README.md
new file mode 100644
index 0000000..9cf0c21
--- /dev/null
+++ b/gcn_classic_text_to_json/notices/near/README.md
@@ -0,0 +1,10 @@
+# NEAR Text Conversion
+
+Parses the trigger tables in the multiple webpages associated with NEAR triggers and creates a `NEAR_{sernum}.json` file for each trigger in a `near_jsons` directory inside an `output` directory, where `sernum` is a sequential number with no relation to the triggers.
+
+### Uses the following fields from the core schema for text notice fields
+- `trigger_time` → None given in the webpage
+
+### Defines the following new fields for the text notice fields
+- `lightcurve_image_url` → None given in the webpage
+- `lightcurve_textfile_url` → None given in the webpage
diff --git a/gcn_classic_text_to_json/notices/near/__init__.py b/gcn_classic_text_to_json/notices/near/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/gcn_classic_text_to_json/notices/near/__main__.py b/gcn_classic_text_to_json/notices/near/__main__.py
new file mode 100644
index 0000000..eaa7074
--- /dev/null
+++ b/gcn_classic_text_to_json/notices/near/__main__.py
@@ -0,0 +1,4 @@
+from . import conversion
+
+if __name__ == "__main__":
+    conversion.parse_all_near_triggers()
diff --git a/gcn_classic_text_to_json/notices/near/conversion.py b/gcn_classic_text_to_json/notices/near/conversion.py
new file mode 100644
index 0000000..c11d146
--- /dev/null
+++ b/gcn_classic_text_to_json/notices/near/conversion.py
@@ -0,0 +1,103 @@
+import json
+import os
+import re
+
+import requests
+from bs4 import BeautifulSoup
+
+
+def create_near_jsons(link, sernum):
+    """Parses through the table in `link` and creates JSONs for each row.
+    It also creates a near_jsons directory inside an output directory.
+
+    Parameters
+    ----------
+    link: string
+        The link to be parsed.
+    sernum: int
+        A sequential number used in the JSON filenames. It has no relation to the data in the JSONs.
+
+    Returns
+    -------
+    sernum: int
+        The updated sernum, to be used in the next call of the function."""
+    output_path = "./output/near_jsons/"
+    if not os.path.exists(output_path):
+        os.makedirs(output_path)
+
+    file = requests.get(link)
+    data = file.text
+
+    # Locate the preformatted trigger list, then step through its rows.
+    # NOTE: the marker strings below are assumptions about the raw HTML of the archived NEAR pages.
+    start_idx = data.find("<pre>")
+    start_idx = data.find("<li>", start_idx)
+
+    while start_idx != -1:
+        output_dict = {
+            "$schema": "https://gcn.nasa.gov/schema/main/gcn/notices/classic/near/alert.schema.json"
+        }
+        end_idx = data.find("\n", start_idx)
+
+        row_data = data[start_idx:end_idx].split()
+
+        trigger_date_data = row_data[1]
+        trigger_time = row_data[4]
+
+        # Two-digit year: "99" means 1999, anything else is treated as 20xx.
+        if trigger_date_data[:2] == "99":
+            output_dict["trigger_time"] = (
+                f"19{trigger_date_data[:2]}-{trigger_date_data[2:4]}-{trigger_date_data[-2:]}T{trigger_time}Z"
+            )
+        else:
+            output_dict["trigger_time"] = (
+                f"20{trigger_date_data[:2]}-{trigger_date_data[2:4]}-{trigger_date_data[-2:]}T{trigger_time}Z"
+            )
+
+        # Each row links to a PostScript, a JPEG and a text-file light curve;
+        # skip the PostScript link and slice the relative hrefs of the other two.
+        postscript_url_start_idx = data.find("<a href=", start_idx)
+        jpeg_url_start_idx = data.find(
+            "<a href=", postscript_url_start_idx + 1
+        ) + len("<a href=")
+        jpeg_url_end_idx = data.find(">", jpeg_url_start_idx)
+        textfile_url_start_idx = data.find("<a href=", jpeg_url_end_idx) + len(
+            "<a href="
+        )
+        textfile_url_end_idx = data.find(">", textfile_url_start_idx)
+
+        jpeg_url_incomplete = data[jpeg_url_start_idx:jpeg_url_end_idx]
+        output_dict["lightcurve_image_url"] = (
+            f"https://gcn.gsfc.nasa.gov/{jpeg_url_incomplete}"
+        )
+
+        textfile_url_incomplete = data[textfile_url_start_idx:textfile_url_end_idx]
+        output_dict["lightcurve_textfile_url"] = (
+            f"https://gcn.gsfc.nasa.gov/{textfile_url_incomplete}"
+        )
+
+        with open(f"{output_path}NEAR_{sernum}.json", "w") as f:
+            json.dump(output_dict, f)
+
+        sernum += 1
+        start_idx = data.find("<li>", end_idx)
+
+    return sernum
+
+
+def parse_all_near_triggers():
+    """The main NEAR webpage links to multiple webpages with more links.
+    This function finds them and calls create_near_jsons for each."""
+    main_link = "https://gcn.gsfc.nasa.gov/near_grbs.html"
+    file = requests.get(main_link)
+    data = file.text
+
+    soup = BeautifulSoup(data, "html.parser")
+
+    search_string = re.compile("grbs.html")
+    html_tags = soup.find_all("a", attrs={"href": search_string})
+
+    html_links = []
+
+    for tag in html_tags:
+        incomplete_link = tag.get("href")
+        html_links.append(f"https://gcn.gsfc.nasa.gov/{incomplete_link}")
+
+    html_links.append(main_link)
+
+    sernum = 1
+    for link in html_links:
+        sernum = create_near_jsons(link, sernum)
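As a usage sketch (not part of the diff): once merged, the two converters can be run either with `python -m gcn_classic_text_to_json.notices.konus` and `python -m gcn_classic_text_to_json.notices.near`, as wired up in the new `__main__.py` files, or directly from Python. The snippet below is a minimal sketch that assumes it is run from the repository root so the package is importable, that `gcn.gsfc.nasa.gov` is reachable, and that each crawl produces at least one notice:

```python
import json
from pathlib import Path

from gcn_classic_text_to_json.notices.konus import conversion as konus_conversion
from gcn_classic_text_to_json.notices.near import conversion as near_conversion

# Crawl the archived KONUS and NEAR trigger pages and write the notices
# to ./output/konus_jsons/ and ./output/near_jsons/.
konus_conversion.parse_all_konus_webpages()
near_conversion.parse_all_near_triggers()

# Spot-check the first notice from each mission (files are named KONUS_{sernum}.json / NEAR_{sernum}.json).
for path in (Path("output/konus_jsons/KONUS_1.json"), Path("output/near_jsons/NEAR_1.json")):
    notice = json.loads(path.read_text())
    print(path.name, notice["trigger_time"])
```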