diff --git a/.env.example b/.env.example index 96dd446..55d6a09 100644 --- a/.env.example +++ b/.env.example @@ -1,4 +1,4 @@ # required environment variables MONGO_CONNECTION_URI=mongodb://localhost:27017 -OPEN_DATA_SERICE_KEY=some_key +OPEN_DATA_SERVICE_KEY=some_key MONGO_DATABASE=council \ No newline at end of file diff --git a/API/MongoDB.py b/API/MongoDB.py index 261334b..29cdc0b 100644 --- a/API/MongoDB.py +++ b/API/MongoDB.py @@ -15,6 +15,7 @@ class Councilor: job: str eduId: int edu: str + year: str @classmethod def from_dict(cls, data: dict): @@ -31,6 +32,7 @@ def from_dict(cls, data: dict): job=data.get("job"), eduId=int(data.get("eduId")), edu=data.get("edu"), + year=data.get("year"), ) def to_dict(self): @@ -47,4 +49,5 @@ def to_dict(self): "job": self.job, "eduId": self.eduId, "edu": self.edu, + "year": self.year, } diff --git a/API/__init__.py b/API/__init__.py index 6ce192c..793d9d7 100644 --- a/API/__init__.py +++ b/API/__init__.py @@ -30,13 +30,14 @@ "8": CouncilType.METROPOLITAN_COUNCIL, "9": CouncilType.LOCAL_COUNCIL, } + CANDIDATE_TYPECODE_TYPE = { "2": CouncilType.NATIONAL_COUNCIL_CAND, # "3": CouncilType.METRO_LEADER_CAND, # "4": CouncilType.LOCAL_LEADER_CAND, "5": CouncilType.METROPOLITAN_COUNCIL_CAND, "6": CouncilType.LOCAL_COUNCIL_CAND, - "7": CouncilType.NATIONAL_COUNCIL_CAND, + "7": CouncilType.NATIONAL_COUNCIL_GLOBAL_CAND, "8": CouncilType.METROPOLITAN_COUNCIL_CAND, "9": CouncilType.LOCAL_COUNCIL_CAND, } diff --git a/API/candidate.py b/API/candidate.py index 804ec27..c00afda 100644 --- a/API/candidate.py +++ b/API/candidate.py @@ -68,28 +68,36 @@ def fetch_all_data( parser.add_argument( "--drop-columns", type=str, - default="num,huboid,hanjaName,status", + default="num,huboid,hanjaName,status,gihoSangse", help="제거할 열 이름을 ','로 구분하여 입력하세요", ) parser.add_argument( - "--save-method", - type=str, - # TODO: Add MongoDB support - # choices=["excel", "mongo"], - choices=["excel"], - default="excel", - help="데이터 저장 방식: 'excel' (현재는 excel만 지원)", + "-m", "--update-mongo", help="API 요청 결과를 MongoDB에 업데이트", action="store_true" + ) + parser.add_argument( + "-o", "--output-store", help="API 요청 결과를 로컬에 저장", action="store_true" + ) + parser.add_argument("--output-path", help="API 요청 결과 저장 경로", default="output") + + args = vars(parser.parse_args()) + print(args) + sgIds = args.get("sgIds").split(",") + if args.get("drop_columns"): + drop_columns = args.get("drop_columns").split(",") + else: + drop_columns = [] + print(drop_columns) + + data_list = fetch_all_data( + sgIds, args.get("sgTypecodes"), drop_columns=drop_columns ) - args = parser.parse_args() - sgIds = args.sgIds.split(",") - drop_columns = args.drop_columns.split(",") if args.drop_columns else [] - - data_list = fetch_all_data(sgIds, args.sgTypecodes, drop_columns=drop_columns) - for sgTypecode in args.sgTypecodes.split(","): + for sgTypecode in args.get("sgTypecodes").split(","): if sgTypecode not in SG_TYPECODE: raise ValueError(f"Invalid sgTypecode: {sgTypecode}") - if args.save_method == "excel": - save_to_excel(data_list, sgTypecode, is_elected=False) - elif args.save_method == "mongo": + + if args.get("update_mongo"): save_to_mongo(data_list, sgTypecode, CANDIDATE_TYPECODE_TYPE[sgTypecode]) + + if args.get("output_store"): + save_to_excel(data_list, sgTypecode, is_elected=False) diff --git a/API/elected.py b/API/elected.py index 7fe299e..5e13669 100644 --- a/API/elected.py +++ b/API/elected.py @@ -41,6 +41,7 @@ def fetch_data( data_list = [] for item in root.findall(".//item"): data_entry = {child.tag: child.text for child in item} + data_entry["year"] = sgId[:4] for column in drop_columns: data_entry.pop(column) @@ -72,22 +73,30 @@ def fetch_all_data( help="제거할 열 이름을 ','로 구분하여 입력하세요", ) parser.add_argument( - "--save-method", - type=str, - choices=["excel", "mongo"], - default="excel", - help="데이터 저장 방식: 'excel', 'mongo'", + "-m", "--update-mongo", help="API 요청 결과를 MongoDB에 업데이트", action="store_true" + ) + parser.add_argument( + "-o", "--output-store", help="API 요청 결과를 로컬에 저장", action="store_true" ) + parser.add_argument("--output-path", help="API 요청 결과 저장 경로", default="output") - args = parser.parse_args() - sgIds = args.sgIds.split(",") - drop_columns = args.drop_columns.split(",") if args.drop_columns else [] + args = vars(parser.parse_args()) + sgIds = args.get("sgIds").split(",") + if args.get("drop_columns"): + drop_columns = args.get("drop_columns").split(",") + else: + drop_columns = [] + + data_list = fetch_all_data( + sgIds, args.get("sgTypecodes"), drop_columns=drop_columns + ) - data_list = fetch_all_data(sgIds, args.sgTypecodes, drop_columns=drop_columns) - for sgTypecode in args.sgTypecodes.split(","): + for sgTypecode in args.get("sgTypecodes").split(","): if sgTypecode not in SG_TYPECODE: raise ValueError(f"Invalid sgTypecode: {sgTypecode}") - if args.save_method == "excel": - save_to_excel(data_list, sgTypecode, is_elected=True) - elif args.save_method == "mongo": + + if args.get("update_mongo"): save_to_mongo(data_list, sgTypecode, ELECTED_TYPECODE_TYPE[sgTypecode]) + + if args.get("output_store"): + save_to_excel(data_list, sgTypecode, is_elected=True) diff --git a/API/utils.py b/API/utils.py index 71ac2c3..57d8865 100644 --- a/API/utils.py +++ b/API/utils.py @@ -40,15 +40,18 @@ def save_to_mongo(data: List[dict], sgTypecode: str, where: str) -> None: db = client["council"] main_collection = db[where] - # TODO: Support other types of test - if sgTypecode in ["6", "9"]: + # TODO: Support other types of councils + if sgTypecode in ["8", "5", "2", "6", "9"]: for entry in data: + if entry["wiwName"] == None: + print(entry) entry["wiwName"] = change_local_name(entry["sdName"], entry["wiwName"]) district_id = get_district_id(entry["sdName"], entry["wiwName"]) if district_id: main_collection.update_one( { + "year": entry["year"], "name": entry["name"], "localId": district_id["localId"], "metroId": district_id["metroId"], @@ -60,34 +63,21 @@ def save_to_mongo(data: List[dict], sgTypecode: str, where: str) -> None: print( f"Warning: '{entry['sdName']} {entry['wiwName']}'에 해당하는 지역 ID가 존재하지 않습니다." ) - elif sgTypecode in ["5", "8"]: - main_collection = db["metro_councilor"] + elif sgTypecode in ["7"]: for entry in data: - entry["wiwName"] = change_local_name(entry["sdName"], entry["wiwName"]) - district_id = get_district_id(entry["sdName"], entry["wiwName"]) - - if not district_id: - print( - f"Warning: '{entry['sdName']} {entry['wiwName']}'에 해당하는 지역 ID가 존재하지 않습니다." - ) - continue - - if district_id: - main_collection.update_one( - { - "name": entry["name"], - "local_id": district_id["local_id"], - "metro_id": district_id["metro_id"], - }, - {"$set": Councilor.from_dict(entry).to_dict()}, - upsert=True, - ) - else: - print( - f"Warning: '{entry['sdName']} {entry['wiwName']}'에 해당하는 지역 ID가 존재하지 않습니다." - ) + entry["wiwName"] = "전국" + main_collection.update_one( + { + "year": entry["year"], + "name": entry["name"], + "localId": 0, + "metroId": 0, + }, + {"$set": Councilor.from_dict(entry).to_dict()}, + upsert=True, + ) else: - raise NotImplementedError("현재 구시군의회의원(6) 및 기초의원비례대표(9)만 구현되어 있습니다.") + raise NotImplementedError(f"아직 구현되지 않은 sgTypecode: {sgTypecode}") print(f"데이터를 성공적으로 MongoDB '{main_collection.name}' 컬렉션에 저장하였습니다.") @@ -124,29 +114,6 @@ def getLocalMetroMap() -> Dict[str, str]: } -def getLocalMetroMap() -> Dict[str, str]: - db = client["district"] - result = db["local_district"].aggregate( - [ - { - "$project": { - "localId": 1, - "metroId": 1, - "sdName": 1, - "wiwName": 1, - } - }, - ] - ) - return { - (item["sdName"], item["wiwName"]): { - "local_id": item["localId"], - "metro_id": item["metroId"], - } - for item in result - } - - def change_local_name(sdName, wiwName): """ 1. 만약 '시' 와 '구'가 모두 wiwName에 있다면, '시' 까지만 쓰기 diff --git a/README.md b/README.md index b841c38..6452ced 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,5 @@ # 다양성 평가 리포트 웹사이트 - 뉴웨이즈 + + ## 프로젝트 개요 -프로젝트 이름 다양성 평가 리포트 웹사이트 - 뉴웨이즈 -기간 23 가을-겨울 + +프로젝트 이름 다양성 평가 리포트 웹사이트 - 뉴웨이즈 +기간 23 가을-겨울 ## 설치 및 실행 과정 + 1. 파이썬 가상환경 생성 - - 아래 명령을 실행하여 파이썬 가상환경을 생성합니다. - ```bash - cd ~ && virtualenv newways --python=3.10 - ``` + - 아래 명령을 실행하여 파이썬 가상환경을 생성합니다. + ```bash + cd ~ && virtualenv newways --python=3.10 + ``` 2. 가상환경 활성화 - - 아래 명령을 실행하여 가상환경을 활성화합니다. - ```bash - source ~/newways/bin/activate - ``` + - 아래 명령을 실행하여 가상환경을 활성화합니다. + ```bash + source ~/newways/bin/activate + ``` 3. 레포지토리 클론 - 아래 명령을 실행하여 레포지토리를 클론합니다. ```bash git clone https://github.com/NewWays-TechForImpactKAIST/main - ``` + ``` 4. 필요한 패키지 설치 - requirements.txt에 명시된 패키지를 설치합니다. ```bash pip install -r requirements.txt - ``` + ``` 5. 환경 변수 설정 - `.env.example` 파일을 복사하여 `.env` 파일을 생성합니다. ```bash cp .env.example .env - ``` - - `.env` 파일을 열어 환경 변수의 값을 필요에 따라 바꾸어줍니다. + ``` + - `.env` 파일을 열어 환경 변수의 값을 필요에 따라 바꾸어줍니다. 6. 예제 코드 실행 - 이 프로젝트는 여러 개의 파이썬 패키지로 구성되어 있습니다. - 각각의 패키지는 독립적으로 실행할 수 있습니다. 단, 실행 시 python -m 옵션(module을 의미)을 사용해야 합니다. - 크롤링 및 데이터베이스 저장 예제 코드를 실행하려면, 아래 명령을 실행합니다. - ```bash - # scrap/local_councils/seoul/junggu.py 파일을 실행합니다. - python -m scrap.local_councils.seoul.junggu - # scrap/examples/database.py 파일을 실행합니다. - python -m scrap.examples.database - ``` \ No newline at end of file + ```bash + # scrap/local_councils/seoul/junggu.py 파일을 실행합니다. + python -m scrap.local_councils.seoul.junggu + # scrap/examples/database.py 파일을 실행합니다. + python -m scrap.examples.database + ``` diff --git a/analysis/age/hist_groups.py b/analysis/age/hist_groups.py index 94a5cda..b1beee9 100644 --- a/analysis/age/hist_groups.py +++ b/analysis/age/hist_groups.py @@ -181,7 +181,7 @@ def insert_data_to_mongo( ) -def cluster(df_original, n_clst, basedic): +def cluster(df_original, n_clst, basedic, clean_flag=True): """구역별 그룹을 만듭니다. df_original: 데이터프레임 n_clst: 그룹 수 @@ -194,11 +194,11 @@ def cluster(df_original, n_clst, basedic): histcoll = statdb["age_hist"] statcoll = statdb["age_stat"] # method = "equal"에서 써 줄 통계. # 기존 histogram 정보는 삭제 (나이별로 넣는 것이기 때문에 찌꺼기값 존재가능) - histcoll.delete_many(basedic.__dict__) - if basedic.method == "equal": - statcoll.delete_many(basedic.__dict__) + if clean_flag: + histcoll.delete_many(basedic.__dict__) + if basedic.method == "equal": + statcoll.delete_many(basedic.__dict__) # 연도별로 데이터 찾아서 넣기! - df_original["year"] = df_original["sgId"] // 10000 df_original = df_original[df_original["year"].isin([2010, 2014, 2018, 2022])] years = df_original["year"].unique() for year in years: diff --git a/analysis/age/main.py b/analysis/age/main.py index bc562ba..a766556 100644 --- a/analysis/age/main.py +++ b/analysis/age/main.py @@ -6,6 +6,7 @@ from analysis.age.most_common_age_group import most_common_age_group from analysis.age.hist_groups import cluster from analysis.age import BasicArgument +from db.client import client # 경고 무시 warnings.filterwarnings("ignore", category=FutureWarning) @@ -23,9 +24,10 @@ "기초의원비례대표": "local_councilor", } +personDB = client["council"] -def run(cluster_by, filenames, N=5, folder_name="To_be_filled"): - ## TO-DO: excel말고 mongodb에서 받아오도록 합니다. + +def run_by_excel(cluster_by, filenames, N=5, folder_name="To_be_filled"): assert cluster_by in ["sdName", "wiwName"] level = 1 if cluster_by == "sdName" else 2 datadir = os.path.join(BASE_DIR, "_data", folder_name) @@ -38,6 +40,7 @@ def run(cluster_by, filenames, N=5, folder_name="To_be_filled"): else: df = df[["sgId", "sdName", "wiwName", "name", "age", "gender"]] df = df.sort_values(by="age") + df["year"] = df["sgId"] // 10000 is_elected = ( True if "당선" in d @@ -56,13 +59,67 @@ def run(cluster_by, filenames, N=5, folder_name="To_be_filled"): cluster(df, N, basedic) +# def main(N=5): +# run_by_excel("sdName", ["[당선][시도의원].xlsx", "[당선][광역의원비례대표].xlsx"]) +# run_by_excel("sdName", ["[후보][시도의원].xlsx", "[후보][광역의원비례대표].xlsx"]) +# run_by_excel("sdName", ["[당선][구시군의회의원].xlsx", "[당선][기초의원비례대표].xlsx"]) +# run_by_excel("sdName", ["[후보][구시군의회의원].xlsx", "[후보][기초의원비례대표].xlsx"]) +# run_by_excel("wiwName", ["[당선][구시군의회의원].xlsx", "[당선][기초의원비례대표].xlsx"]) +# run_by_excel("wiwName", ["[후보][구시군의회의원].xlsx", "[후보][기초의원비례대표].xlsx"]) + + +def run_by_mongo(cluster_by, is_elected, councilorType, N=5): + assert cluster_by in ["sdName", "wiwName"] + level = 1 if cluster_by == "sdName" else 2 + data = [] + if not is_elected: + councilorType = councilorType + "_candidate" + cursor = personDB[councilorType].find() + if level == 1: + for person in cursor: + data.append( + { + "year": person.get("year"), + "sdName": person.get("sdName"), + "name": person.get("name"), + "age": person.get("age"), + "gender": person.get("gender"), + } + ) + else: + for person in cursor: + data.append( + { + "year": person.get("year"), + "sdName": person.get("sdName"), + "wiwName": person.get("wiwName"), + "name": person.get("name"), + "age": person.get("age"), + "gender": person.get("gender"), + } + ) + + df = pd.DataFrame(data) + df = df.sort_values(by="age") + + for method in ["kmeans", "equal"]: + basedic = BasicArgument( + councilorType=councilorType, + is_elected=is_elected, + level=level, + method=method, + ) + cluster(df, N, basedic, clean_flag=True) + + def main(N=5): - run("sdName", ["[당선][시도의원].xlsx", "[당선][광역의원비례대표].xlsx"]) - run("sdName", ["[후보][시도의원].xlsx", "[후보][광역의원비례대표].xlsx"]) - run("sdName", ["[당선][구시군의회의원].xlsx", "[당선][기초의원비례대표].xlsx"]) - run("sdName", ["[후보][구시군의회의원].xlsx", "[후보][기초의원비례대표].xlsx"]) - run("wiwName", ["[당선][구시군의회의원].xlsx", "[당선][기초의원비례대표].xlsx"]) - run("wiwName", ["[후보][구시군의회의원].xlsx", "[후보][기초의원비례대표].xlsx"]) + # 세종시의 경우 어느 순간 승급하기 때문에 sdName을 먼저 해야, sdName이 cluster 시작 때 밀려도 괜챃다. (cluster 함수 참조) + run_by_mongo("sdName", is_elected=True, councilorType="metro_councilor") + run_by_mongo("sdName", is_elected=False, councilorType="metro_councilor") + run_by_mongo("sdName", is_elected=True, councilorType="local_councilor") + run_by_mongo("sdName", is_elected=False, councilorType="local_councilor") + run_by_mongo("wiwName", is_elected=True, councilorType="local_councilor") + run_by_mongo("wiwName", is_elected=False, councilorType="local_councilor") main() diff --git a/configurations/secrets.py b/configurations/secrets.py index 5a72cd4..63b0c09 100644 --- a/configurations/secrets.py +++ b/configurations/secrets.py @@ -26,7 +26,7 @@ class OpenDataPortalSecrets: 공공데이터포털(data.go.kr) API 호출에 필요한 서비스 키를 정의합니다. """ - service_key = str(os.getenv("OPEN_DATA_SERICE_KEY") or "") + service_key = str(os.getenv("OPEN_DATA_SERVICE_KEY") or "") class EmailSecrets: @@ -37,3 +37,11 @@ class EmailSecrets: sender_email = str(os.getenv("SCRAP_SENDER_EMAIL") or "") receiver_email = str(os.getenv("SCRAP_RECEIVER_EMAIL") or "") password = str(os.getenv("SCRAP_EMAIL_PASSWORD") or "") + + +class WebhookSecrets: + """ + 스크랩 결과 웹훅 전송에 필요한 키를 정의합니다. + """ + + webhook_url = str(os.getenv("WEBHOOK_URL") or "") diff --git a/db/types.py b/db/types.py index c65711c..560b06c 100644 --- a/db/types.py +++ b/db/types.py @@ -14,6 +14,8 @@ class CouncilType(str, Enum): NATIONAL_COUNCIL = "national_councilor" NATIONAL_COUNCIL_CAND = "national_councilor_candidate" + NATIONAL_COUNCIL_GLOBAL = "national_councilor_global" + NATIONAL_COUNCIL_GLOBAL_CAND = "national_councilor_global_candidate" METROPOLITAN_COUNCIL = "metropolitan_councilor" METROPOLITAN_COUNCIL_CAND = "metropolitan_councilor_candidate" diff --git a/scrap/local_councils/basic.py b/scrap/local_councils/basic.py index d4ac5e6..42d688b 100644 --- a/scrap/local_councils/basic.py +++ b/scrap/local_councils/basic.py @@ -124,9 +124,6 @@ def sel_getname(profile, element, class_, wrapper_element, wrapper_class_): name = strong_element.text.strip() except NoSuchElementException: pass - # print(name+"\n") - if name == "": - return "231114" name = name.split("(")[0].split(":")[-1].strip() # 이름 뒷 한자이름, 앞 '이 름:' 제거 # TODO : 만약 이름이 우연히 아래 단어를 포함하는 경우를 생각해볼만 함. if len(name) > 3: @@ -154,7 +151,9 @@ def extract_party(string): return None -def goto_profilesite(profile, wrapper_element, wrapper_class_, wrapper_txt, url): +def goto_profilesite( + profile, wrapper_element, wrapper_class_, wrapper_txt, url, inner_euckr=False +): # 의원 프로필에서 프로필보기 링크를 가져옴 parsed_url = urlparse(url) base_url = f"{parsed_url.scheme}://{parsed_url.netloc}" @@ -169,7 +168,10 @@ def goto_profilesite(profile, wrapper_element, wrapper_class_, wrapper_txt, url) # base_url = base_url + '/' profile_url = base_url + profile_link["href"] try: - profile = get_soup(profile_url, verify=False) + if inner_euckr: + profile = get_soup(profile_url, verify=False, encoding="euc-kr") + else: + profile = get_soup(profile_url, verify=False) except Exception: raise RuntimeError("[basic.py] '//'가 있진 않나요?", " url: ", profile_url) return profile @@ -212,7 +214,7 @@ def getpty(profile, element, class_, wrapper_element, wrapper_class_, wrapper_tx ) ) if party_pulp_list == []: - raise RuntimeError("[basic.py] 정당정보 regex 실패") + raise Exception("[basic.py] 정당정보 regex 실패") party_pulp = party_pulp_list[0] party_string = party_pulp.get_text(strip=True).split(" ")[-1] while True: @@ -221,14 +223,16 @@ def getpty(profile, element, class_, wrapper_element, wrapper_class_, wrapper_tx if (party_pulp := party_pulp.find_next("span")) is not None: party_string = party_pulp.text.strip().split(" ")[-1] else: - return "[basic.py] 정당 정보 파싱 불가" + raise Exception("[basic.py] 정당 정보 파싱 불가") -def getpty_easy(profile, wrapper_element, wrapper_class_, wrapper_txt, url): +def getpty_easy( + profile, wrapper_element, wrapper_class_, wrapper_txt, url, inner_euckr=False +): # 의원 프로필에서 의원이 몸담는 정당 이름을 가져옴 if wrapper_element is not None: profile = goto_profilesite( - profile, wrapper_element, wrapper_class_, wrapper_txt, url + profile, wrapper_element, wrapper_class_, wrapper_txt, url, inner_euckr ) party = extract_party(profile.text) assert party is not None @@ -253,7 +257,9 @@ def sel_getpty_easy( return party -def scrap_basic(url, cid, args: ScrapBasicArgument, encoding="utf-8") -> ScrapResult: +def scrap_basic( + url, cid, args: ScrapBasicArgument, encoding="utf-8", inner_euckr=False +) -> ScrapResult: """의원 상세약력 스크랩 :param url: 의원 목록 사이트 url :param cid: 의회 id @@ -293,7 +299,12 @@ def scrap_basic(url, cid, args: ScrapBasicArgument, encoding="utf-8") -> ScrapRe except Exception as e: try: party = getpty_easy( - profile, args.pty_wrapelt, args.pty_wrapcls, args.pty_wraptxt, url + profile, + args.pty_wrapelt, + args.pty_wrapcls, + args.pty_wraptxt, + url, + inner_euckr, ) except Exception: raise RuntimeError("[basic.py] 의원 정당을 가져오는데 실패했습니다. 이유: " + str(e)) @@ -331,8 +342,6 @@ def sel_scrap_basic(url, cid, args: ScrapBasicArgument) -> ScrapResult: raise RuntimeError( "[basic.py/selenium] 의원 이름을 가져오는데 실패했습니다. 이유 : " + str(e) ) - if name == "231114": - continue try: party = sel_getpty_easy( profile, args.pty_wrapelt, args.pty_wrapcls, args.pty_wraptxt, url diff --git a/scrap/local_councils/gyeongsang.py b/scrap/local_councils/gyeongsang.py index 230735b..693bcbf 100644 --- a/scrap/local_councils/gyeongsang.py +++ b/scrap/local_councils/gyeongsang.py @@ -10,6 +10,9 @@ regex_pattern, ) +party_keywords = getPartyList() +party_keywords.append("무소속") + def scrap_186( url, @@ -123,6 +126,34 @@ def scrap_191( return ret_local_councilors(cid, councilors) +def scrap_192( + url, + cid, + args: ArgsType = None, +) -> ScrapResult: + """경상북도 구미시""" + soup = get_soup(url, verify=False, encoding="euc-kr") + councilors: List[Councilor] = [] + for profile in soup.find_all("div", class_="profile"): + name_tag = profile.find("li", class_="name") + name = name_tag.get_text(strip=True).split("(")[0] if name_tag else "이름 정보 없음" + + party = "정당 정보 없음" + profile_link = profile.find_all("a")[1] + parsed_url = urlparse(url) + base_url = f"{parsed_url.scheme}://{parsed_url.netloc}" + profile_url = base_url + profile_link["href"] + profile = get_soup(profile_url, verify=False, encoding="euc-kr") + party = "" + for keyword in party_keywords: + if keyword in profile.text: + party = keyword + break + councilors.append(Councilor(name=name, jdName=party)) + + return ret_local_councilors(cid, councilors) + + def scrap_194( url, cid, @@ -188,6 +219,26 @@ def scrap_196( return ret_local_councilors(cid, councilors) +def scrap_197( + url, + cid, + args: ArgsType = None, +) -> ScrapResult: + """경상북도 경산시""" + soup = get_soup(url, verify=False, encoding="euc-kr") + councilors: List[Councilor] = [] + for profile in soup.find_all("div", class_="memberL") + soup.find_all( + "div", class_="memberR" + ): + party = profile.find_previous("h4", class_="title").text.strip() + assert party in party_keywords + name = profile.find("dt").text.strip() + + councilors.append(Councilor(name=name, jdName=party)) + + return ret_local_councilors(cid, councilors) + + def scrap_198( url, cid, @@ -216,18 +267,18 @@ def scrap_199( args: ArgsType = None, ) -> ScrapResult: """경상북도 고령군""" - soup = get_soup(url, verify=False) - councilors: List[Councilor] = [] - for profile in soup.find_all("div", class_="profile"): - name_tag = profile.find("em", class_="name") - name = name_tag.get_text(strip=True).split("\r")[0] if name_tag else "이름 정보 없음" - + browser = get_selenium(url) + councilors: list[Councilor] = [] + for profile in browser.find_elements(By.CSS_SELECTOR, "div[class='profile']"): + name_tag = profile.find_element(By.CSS_SELECTOR, "em[class='name']") + name = name_tag.text.strip().split("\r")[0] if name_tag else "이름 정보 없음" + party = "" + for keyword in party_keywords: + if keyword in profile.text: + party = keyword + break party = "정당 정보 없음" - party_info = profile.find("em", string="정 당 : ") - if party_info: - party = party_info.find_next("span").get_text(strip=True) - - councilors.append(Councilor(name=name, jdName=party)) + councilors.append(Councilor(name, party)) return ret_local_councilors(cid, councilors) @@ -260,6 +311,32 @@ def scrap_201( return ret_local_councilors(cid, councilors) +def scrap_202( + url, + cid, + args: ArgsType = None, +) -> ScrapResult: + """경상북도 군위군""" + soup = get_soup(url, verify=False, encoding="euc-kr") + councilors: List[Councilor] = [] + for profile in soup.find_all("div", class_="profile"): + name_tag = profile.find("li", class_="name") + name = name_tag.get_text(strip=True).split("(")[0] if name_tag else "이름 정보 없음" + link = profile.find("p", class_="btn").find("a")["href"] + parsed_url = urlparse(url) + base_url = f"{parsed_url.scheme}://{parsed_url.netloc}" + profile_url = base_url + link + profile = get_soup(profile_url, verify=False, encoding="euc-kr") + party = "" + for keyword in party_keywords: + if keyword in profile.text: + party = keyword + break + councilors.append(Councilor(name=name, jdName=party)) + + return ret_local_councilors(cid, councilors) + + def scrap_203( url, cid, @@ -282,6 +359,57 @@ def scrap_203( return ret_local_councilors(cid, councilors) +def scrap_204( + url, + cid, + args: ArgsType = None, +) -> ScrapResult: + """경상북도 청송군""" + soup = get_soup(url, verify=False) + councilors: List[Councilor] = [] + for profile in soup.find_all("div", class_="box3vm1"): + name_tag = profile.find("span", class_="t3") + name = name_tag.get_text(strip=True).split()[-1] if name_tag else "이름 정보 없음" + link = profile.find("a", class_="button")["href"] + parsed_url = urlparse(url) + base_url = f"{parsed_url.scheme}://{parsed_url.netloc}" + profile_url = base_url + link + profile = get_soup(profile_url, verify=False) + link = profile.find("a", text="의원소개", href=True) + profile_url = base_url + link["href"] + profile = get_soup(profile_url, verify=False) + + party = "" + for keyword in party_keywords: + if keyword in profile.text: + party = keyword + break + councilors.append(Councilor(name=name, jdName=party)) + + return ret_local_councilors(cid, councilors) + + +def scrap_205( + url, + cid, + args: ArgsType = None, +) -> ScrapResult: + """경상북도 영양군""" + # TODO : gzip 문제 생기니, selenium으로 대체 + print(url) + soup = get_soup(url, verify=False) + councilors: List[Councilor] = [] + profile_list = soup.find("div", id="content_box") + for name_tag in profile_list.find_all("h3"): + name = name_tag.get_text(strip=True).split("(")[0] if name_tag else "이름 정보 없음" + ul = name_tag.find_next("ul") + li_party = ul.find("li", string="소속정당") + party = li_party.text.split(" : ")[-1].strip() + councilors.append(Councilor(name=name, jdName=party)) + + return ret_local_councilors(cid, councilors) + + def scrap_206( url, cid, diff --git a/scrap/local_councils/jeolla.py b/scrap/local_councils/jeolla.py index 7301ed5..cc7dee3 100644 --- a/scrap/local_councils/jeolla.py +++ b/scrap/local_councils/jeolla.py @@ -7,9 +7,13 @@ extract_party, find, findall, + sel_find, regex_pattern, ) +party_keywords = getPartyList() +party_keywords.append("무소속") + def scrap_154( url, @@ -335,30 +339,53 @@ def scrap_167( # return ret_local_councilors(cid, councilors) -def scrap_177( +def scrap_175( url, cid, args: ArgsType = None, ) -> ScrapResult: - """전라남도 강진군""" - soup = get_soup(url, verify=False) - councilors: List[Councilor] = [] - mlist = soup.find_all("ul", class_="memlist")[0] + """전라남도 화순군""" + browser = get_selenium(url) + councilors: list[Councilor] = [] + for profileList in browser.find_elements(By.CSS_SELECTOR, "ul[id='councilList']"): + for profile in profileList.find_elements( + By.CSS_SELECTOR, "ul[class='name_51']" + ): + name_tag = profile.find_element(By.TAG_NAME, "li") + name = name_tag.text.strip() if name_tag else "이름 정보 없음" + + profile_link = sel_find(profile, "a") + page_content = get_selenium(profile_link.get_attribute("href")).page_source + party = "" + for keyword in party_keywords: + if keyword in page_content: + party = keyword + break + + councilors.append(Councilor(name, party)) - for profile in mlist.find_all("li", recursive=False): - info = profile.find("ul", class_="info") - name = ( - info.find("h5").get_text(strip=True) - if info.find("h5").get_text(strip=True) - else "이름 정보 없음" - ) + return ret_local_councilors(cid, councilors) - li = info.find_all("li", recursive=False)[6] - party = "정당 정보 없음" - party_dd = li.find("dd") - if party_dd: - party = party_dd.get_text(strip=True) - councilors.append(Councilor(name=name, jdName=party)) + +def scrap_177( + url, + cid, + args: ArgsType = None, +) -> ScrapResult: + """전라남도 강진군""" + browser = get_selenium(url) + councilors: list[Councilor] = [] + for profileList in browser.find_elements(By.CSS_SELECTOR, "ul[id='memlist']"): + for profile in profileList.find_elements(By.CSS_SELECTOR, "ul[class='info']"): + name_tag = profile.find_element(By.TAG_NAME, "h5") + name = name_tag.text.strip() if name_tag else "이름 정보 없음" + party = "" + for keyword in party_keywords: + if keyword in profile.text: + party = keyword + break + party = "정당 정보 없음" + councilors.append(Councilor(name, party)) return ret_local_councilors(cid, councilors) @@ -369,14 +396,24 @@ def scrap_178( args: ArgsType = None, ) -> ScrapResult: """전라남도 완도군""" - councilors: List[Councilor] = [] - - result = requests.get(url) - result_json = result.json() - for profile in result_json["list"]: - name = profile["cmNm"] - party = profile["mpParty"] - councilors.append(Councilor(name=name, jdName=party)) + browser = get_selenium(url) + councilors: list[Councilor] = [] + for profileList in browser.find_elements( + By.CSS_SELECTOR, "div[class='congressperson_list']" + ): + for profile in profileList.find_elements( + By.CSS_SELECTOR, "div[class='col-lg-6']" + ): + name_tag = profile.find_element(By.TAG_NAME, "strong") + name = name_tag.text.strip() if name_tag else "이름 정보 없음" + profile_link = sel_find(profile, "a", class_="icon_btn") + page_content = get_selenium(profile_link.get_attribute("href")).page_source + party = "" + for keyword in party_keywords: + if keyword in page_content: + party = keyword + break + councilors.append(Councilor(name, party)) return ret_local_councilors(cid, councilors) @@ -395,9 +432,10 @@ def scrap_179( name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음" party = "정당 정보 없음" - party_info = profile.find("span", string="소속정당 :") - if party_info: - party = party_info.find_next("span").get_text(strip=True) + for keyword in party_keywords: + if keyword in profile.text: + party = keyword + break councilors.append(Councilor(name=name, jdName=party)) return ret_local_councilors(cid, councilors) diff --git a/scrap/utils/export.py b/scrap/utils/data_io.py similarity index 69% rename from scrap/utils/export.py rename to scrap/utils/data_io.py index 3c1795b..44f928b 100644 --- a/scrap/utils/export.py +++ b/scrap/utils/data_io.py @@ -3,7 +3,8 @@ from dataclasses import asdict from typing import Dict -from scrap.utils.types import ScrapResult, ScrapBasicArgument +from scrap.utils.types import ScrapResult +from db.types import Councilor def export_results_to_json( @@ -38,3 +39,21 @@ def export_results_to_txt( for cid, councilors in results.items(): councilors = "\n".join([c.to_txt() for c in councilors]) f.write(f"| {cid} | {councilors}\n") + + +def import_results_from_json( + input_path: str, council_type: str +) -> Dict[str, ScrapResult]: + with open(input_path, "r", encoding="utf-8") as f: + results = json.load(f) + + results = { + k: ScrapResult( + council_id=k, + council_type=council_type, + councilors=[Councilor(**c) for c in v], + ) + for k, v in results.items() + } + + return results diff --git a/scrap/utils/runner.py b/scrap/utils/runner.py index 01cb2a1..cfee3da 100644 --- a/scrap/utils/runner.py +++ b/scrap/utils/runner.py @@ -10,7 +10,13 @@ from tqdm import tqdm from abc import * -from scrap.utils.export import export_results_to_json, export_results_to_txt +from configurations.secrets import WebhookSecrets + +from scrap.utils.data_io import ( + export_results_to_json, + export_results_to_txt, + import_results_from_json, +) from scrap.utils.database import save_to_database from scrap.utils.types import ScrapResult, ScrapBasicArgument from scrap.utils.spreadsheet import read_record_from_spreadsheet @@ -30,6 +36,7 @@ from scrap.local_councils import * from scrap.metropolitan_council import * from scrap.national_council import * +from requests import post from scrap.group_head import * from requests.exceptions import Timeout @@ -65,6 +72,16 @@ def handle_errors(self, cid: int | str, error): self.parseerror_count += 1 logging.error(f"| {cid} | 오류: {error}") + def send_webhook(self, message: str) -> None: + webhook_url = WebhookSecrets.webhook_url + payload = {"text": message} + + response = requests.post(webhook_url, json=payload) + if response.status_code != 200: + raise ValueError( + f"Request to slack returned an error {response.status_code}, the response is:\n{response.text}" + ) + @abstractmethod def run(self) -> Dict[str, ScrapResult]: pass @@ -94,6 +111,9 @@ def get_records_from_data_source(self, data_source: str): def is_euc_kr(self, n: int) -> bool: return n in self.runner_args["euc_kr"] + def inner_euckr(self, n: int) -> bool: + return n in self.runner_args["inner_euckr"] + def is_special_function(self, n: int) -> bool: return n in self.runner_args["special_functions"] @@ -102,6 +122,7 @@ def is_selenium_basic(self, n: int) -> bool: def run_single(self, cid: int) -> ScrapResult: encoding = "euc-kr" if self.is_euc_kr(cid) else "utf-8" + inner_euckr = self.inner_euckr(cid) council_url = self.url_records[cid - 1]["URL"] council_args = self.council_args.get(str(cid), None) if council_args is not None: @@ -121,11 +142,13 @@ def run_single(self, cid: int) -> ScrapResult: if self.is_selenium_basic(cid): result = sel_scrap_basic(council_url, cid, council_args) else: - result = scrap_basic(council_url, cid, council_args, encoding) + result = scrap_basic( + council_url, cid, council_args, encoding, inner_euckr + ) return result - def run(self, cids: Iterable[int]) -> Dict[int, ScrapResult]: + def run(self, cids: Iterable[int], enable_webhook: bool) -> Dict[int, ScrapResult]: scrape_results = dict() for cid in tqdm(cids): @@ -137,9 +160,10 @@ def run(self, cids: Iterable[int]) -> Dict[int, ScrapResult]: except Exception as e: self.handle_errors(cid, e) - logging.info( - f"| 총 실행 횟수: {len(cids)} | 에러: {list(self.error_log.keys())}, 총 {len(self.error_log)}회 | 그 중 정보 없음 횟수: {self.parseerror_count} | 타임아웃 횟수: {self.timeout_count} |" - ) + result_summary = f"| 총 실행 횟수: {len(cids)} | 에러: {list(self.error_log.keys())}, 총 {len(self.error_log)}회 | 그 중 정보 없음 횟수: {self.parseerror_count} | 타임아웃 횟수: {self.timeout_count} |" + logging.info(result_summary) + if enable_webhook: + self.send_webhook("지방의회 스크랩 결과\n" + result_summary) return scrape_results @@ -157,7 +181,7 @@ def run_single(self, cid: int) -> ScrapResult: raise NotImplementedError(f"함수를 찾을 수 없습니다: {function_name}") return result - def run(self, cids: Iterable[int]) -> Dict[int, ScrapResult]: + def run(self, cids: Iterable[int], enable_webhook: bool) -> Dict[int, ScrapResult]: scrape_results = dict() for cid in tqdm(cids): @@ -169,9 +193,10 @@ def run(self, cids: Iterable[int]) -> Dict[int, ScrapResult]: except Exception as e: self.handle_errors(cid, e) - logging.info( - f"| 총 실행 횟수: {len(cids)} | 에러: {list(self.error_log.keys())}, 총 {len(self.error_log)}회 | 그 중 정보 없음 횟수: {self.parseerror_count} | 타임아웃 횟수: {self.timeout_count} |" - ) + result_summary = f"| 총 실행 횟수: {len(cids)} | 에러: {list(self.error_log.keys())}, 총 {len(self.error_log)}회 | 그 중 정보 없음 횟수: {self.parseerror_count} | 타임아웃 횟수: {self.timeout_count} |" + logging.info(result_summary) + if enable_webhook: + self.send_webhook("광역의회 스크랩 결과\n" + result_summary) return scrape_results @@ -229,15 +254,26 @@ def main(args: Dict[str, str]) -> None: where = args.get("where") current_time = datetime.datetime.now().strftime("%Y%m%d_%H%M") - runner_kwargs = args | {"current_time": current_time} - runner = ScraperFactory(where, runner_kwargs).create_scraper() + json_import_path = args.get("import_from_json") + if json_import_path: + if not args.get("update_mongo"): + raise ValueError( + "JSON 파일에서 가져온 결과를 MongoDB에 업데이트하려면 --update-mongo (-m) 옵션을 사용해야 합니다." + ) - cids_to_run = parse_cids(args.get("cids"), where) - if cids_to_run: - results = runner.run(cids_to_run) + print("JSON 파일에서 결과를 가져옵니다. 다른 스크랩 관련 옵션은 무시됩니다.") + results = import_results_from_json(json_import_path, where) else: - results = runner.run() + runner_kwargs = args | {"current_time": current_time} + runner = ScraperFactory(where, runner_kwargs).create_scraper() + + cids_to_run = parse_cids(args.get("cids"), where) + enable_webhook = args.get("disable_webhook") + if cids_to_run: + results = runner.run(cids_to_run, enable_webhook) + else: + results = runner.run() if args.get("update_mongo"): for result in results.values(): @@ -250,7 +286,7 @@ def main(args: Dict[str, str]) -> None: export_results_to_txt(results, args.get("output_path"), current_time) -def parse_cids(cids_str: Optional[str], where: str) -> Optional[List[int]]: +def parse_cids(cids_str: Optional[str], where: str) -> Optional[Iterable[int]]: if cids_str and where in ["local", "metro"]: return [int(cid.strip()) for cid in cids_str.split(",")] elif where == "metro": @@ -272,6 +308,9 @@ def parse_cids(cids_str: Optional[str], where: str) -> Optional[List[int]]: choices=["local", "metro", "national", "leaders"], default="local", ) + parser.add_argument( + "--import-from-json", help="경로에서 JSON 파일을 읽어와 결과를 받아옴", default=None + ) parser.add_argument( "--data-source", help="사용할 데이터 소스 ('google_sheets', 'mongodb')", @@ -280,18 +319,18 @@ def parse_cids(cids_str: Optional[str], where: str) -> Optional[List[int]]: ) parser.add_argument("-l", "--log_path", help="로그 파일 경로", default="logs") parser.add_argument( - "-m", "--update_mongo", help="스크랩 결과를 MongoDB에 업데이트", action="store_true" + "-m", "--update-mongo", help="스크랩 결과를 MongoDB에 업데이트", action="store_true" ) parser.add_argument( - "-o", "--output_store", help="스크랩 결과를 로컬에 저장", action="store_true" + "-o", "--output-store", help="스크랩 결과를 로컬에 저장", action="store_true" ) parser.add_argument( - "--output_format", + "--output-format", help="스크랩 결과 저장 형식 ('json', 'txt')", choices=["json", "txt"], default="json", ) - parser.add_argument("--output_path", help="스크랩 결과 저장 경로", default="output") + parser.add_argument("--output-path", help="스크랩 결과 저장 경로", default="output") parser.add_argument( "-c", "--cids", help="스크랩할 의회 ID 목록 (','로 구분, 지방/광역의회만 해당)", default=None ) @@ -305,6 +344,11 @@ def parse_cids(cids_str: Optional[str], where: str) -> Optional[List[int]]: help="지방의회 스크랩 시 사용할 council_args JSON 파일 경로", default="scrap/utils/scrap_args.json", ) + parser.add_argument( + "--disable-webhook", + help="스크랩 결과 웹훅 전송 비활성화", + action="store_false", + ) args = vars(parser.parse_args()) main(args) diff --git a/scrap/utils/runner_args.json b/scrap/utils/runner_args.json index ddad42b..d2dbddf 100644 --- a/scrap/utils/runner_args.json +++ b/scrap/utils/runner_args.json @@ -1,6 +1,6 @@ { "euc_kr": [ - 6, 13, 16, 31, 72, 88, 112, 134, 154, 157, 163, 165, 167, 176, 181, 197, + 6, 13, 16, 31, 72, 88, 112, 134, 154, 157, 163, 165, 167, 176, 181, 200, 202, 222 ], "special_functions": [ @@ -9,11 +9,12 @@ 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 62, 63, 64, 88, 97, 103, 107, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 132, 134, 140, 142, 154, 155, 156, 157, 160, 161, 162, 163, - 164, 165, 167, 177, 178, 179, 182, 183, 184, 186, 188, 189, 190, 191, 194, - 195, 196, 198, 199, 201, 203, 206, 208, 209, 210, 212, 213, 214, 215, 216, + 164, 165, 167, 175, 177, 178, 179, 182, 183, 184, 186, 188, 189, 190, 191, 192, 194, + 195, 196, 197, 198, 199, 201, 202, 203, 204, 205, 206, 208, 209, 210, 212, 213, 214, 215, 216, 217, 218, 219, 220, 222, 223, 224, 226 ], - "selenium_basic": [76, 78, 101, 169, 173, 177], - "no_information": [18, 29, 106, 111, 172, 181, 185, 187, 197, 200, 204, 207], - "error_unresolved": [170, 171] + "selenium_basic": [76, 78, 101, 169, 173], + "no_information": [18, 29, 106, 111, 172, 174, 181, 185, 187, 207], + "error_unresolved": [170, 171], + "inner_euckr": [200] } diff --git a/scrap/utils/scrap_args.json b/scrap/utils/scrap_args.json index e854932..1bb1405 100644 --- a/scrap/utils/scrap_args.json +++ b/scrap/utils/scrap_args.json @@ -644,13 +644,21 @@ "pty_elt": "ul", "pty_cls": "dot" }, - "177": { - "pf_elt": "li", - "pf_cls": "item_box", + "180": { + "pf_elt": "dl", + "pf_cls": "ml_desc", "pf_memlistelt": "ul", - "pf_memlistcls": "memlist", - "name_elt": "h5", - "name_cls": "dd", - "pty_elt": "dl" + "pf_memlistcls": "member_list", + "name_elt": "dt", + "pty_elt": "dd", + "pty_cls": "ml_data" + }, + "200": { + "pf_elt": "dl", + "pf_memlistelt": "div", + "pf_memlistcls": "list", + "name_elt": "dd", + "name_cls": "name", + "pty_wrapelt": "a" } } \ No newline at end of file diff --git a/scrap/utils/spreadsheet.py b/scrap/utils/spreadsheet.py index da8abcb..70652e2 100644 --- a/scrap/utils/spreadsheet.py +++ b/scrap/utils/spreadsheet.py @@ -88,29 +88,33 @@ def scrap_all_metro_councils() -> None: else: emsg: str = f"[scrap/metropolitan_council.py]에 {n}번 지역을 위한\ 함수가 없네요." - add_error(n, emsg) + # add_error(n, emsg) + print(emsg) if "정보 없음" in result: emsg = "스크랩 결과에 '정보 없음'이 포함되어 있습니다. 일부 인명에\ 대해 스크랩이 실패했다는 뜻이에요. 함수나 인자를 점검해 주세요." parse_error_times += 1 - errors.append(n) + # errors.append(n) + print(emsg) # print(f"| {n} | {result}") except Timeout: - emsg = f"{council_url}에 시도한 연결이 타임아웃됐어요." + emsg = f"시도한 연결이 타임아웃됐어요." timeouts += 1 - add_error(n, emsg) + print(emsg) + # add_error(n, emsg) except Exception as e: - add_error(n, "기타 오류 - " + str(e)) - emessages = ( - f""" - 총 실행 횟수: {N} - 에러: {enumbers}, 총 {len(enumbers)}회 - 그 중 '정보 없음' 횟수: {parse_error_times} - 타임아웃 횟수: {timeouts} - """ - + emessages - ) - email_result(emessages) + print(e) + # add_error(n, "기타 오류 - " + str(e)) + # emessages = ( + # f""" + # 총 실행 횟수: {N} + # 에러: {enumbers}, 총 {len(enumbers)}회 + # 그 중 '정보 없음' 횟수: {parse_error_times} + # 타임아웃 횟수: {timeouts} + # """ + # + emessages + # ) + # email_result(emessages) def scrap_all_local_councils() -> None: @@ -131,10 +135,11 @@ def scrap_all_local_councils() -> None: 167, 176, 181, - 197, + 200, 202, 222, ] + inner_euckr = [200] special_functions = ( list(range(1, 57)) + [62, 63, 64, 88, 97, 103, 107] @@ -150,13 +155,18 @@ def scrap_all_local_councils() -> None: 189, 190, 191, + 192, 194, 195, 196, + 197, 198, 199, 201, + 202, 203, + 204, + 205, 206, 208, 209, @@ -166,7 +176,7 @@ def scrap_all_local_councils() -> None: + [222, 223, 224, 226] ) selenium_basic = [76, 78, 101, 169, 173, 177] - no_information = [18, 29, 106, 111, 172, 181, 185, 187, 197, 200, 204, 207] + no_information = [18, 29, 106, 111, 172, 181, 185, 187, 207] error_unsolved = [170, 171] f = open(JSON_PATH, "r") args = json.load(f) @@ -180,7 +190,7 @@ def scrap_all_local_councils() -> None: parse_error_times = 0 timeouts = 0 N = 226 - for n in range(1, N + 1): # range(1, N + 1): + for n in [205]: if n in no_information + error_unsolved: emsg: str = ( ( @@ -192,9 +202,10 @@ def scrap_all_local_councils() -> None: + " 링크: " + data[n - 1]["URL"] ) - add_error(n, emsg) + # add_error(n, emsg) continue encoding = "euc-kr" if n in euc_kr else "utf-8" + inner_euckr = True if n in inner_euckr else False council_url: str = "" try: council_url = data[n - 1]["URL"] @@ -215,39 +226,44 @@ def scrap_all_local_councils() -> None: 명시되어 있는데 함수가 정의되어 있지 않네요. [scrap/utils/\ spreadsheet.py의 special_functions에 함수 번호를 빼고 \ 다시 시도해 보시겠어요?]" - add_error(n, emsg) + # add_error(n, emsg) elif n in selenium_basic: result = str(sel_scrap_basic(council_url, n, council_args).councilors) else: result = str( - scrap_basic(council_url, n, council_args, encoding).councilors + scrap_basic( + council_url, n, council_args, encoding, inner_euckr + ).councilors ) if "정보 없음" in result: emsg = "스크랩 결과에 '정보 없음'이 포함되어 있습니다. 일부 인명에\ 대해 스크랩이 실패했다는 뜻이에요. 함수나 인자를 점검해 주세요." parse_error_times += 1 - errors.append(n) + print(emsg) + # errors.append(n) # print(f"| {n} | {result}") except Timeout: emsg = f"{council_url}에 시도한 연결이 타임아웃됐어요." timeouts += 1 - add_error(n, emsg) + # add_error(n, emsg) except Exception as e: - add_error(n, "기타 오류 - " + str(e)) - emessages = ( - f""" - 총 실행 횟수: {N} - 에러: {enumbers}, 총 {len(enumbers)}회 - 그 중 '정보 없음' 횟수: {parse_error_times} - 타임아웃 횟수: {timeouts} - """ - + emessages - ) - email_result(emessages) + print(e) + print(result) + # add_error(n, "기타 오류 - " + str(e)) + # emessages = ( + # f""" + # 총 실행 횟수: {N} + # 에러: {enumbers}, 총 {len(enumbers)}회 + # 그 중 '정보 없음' 횟수: {parse_error_times} + # 타임아웃 횟수: {timeouts} + # """ + # + emessages + # ) + # email_result(emessages) def main() -> None: - scrap_all_metro_councils() + # scrap_all_metro_councils() scrap_all_local_councils()