forked from PradipKanzariya/WRPC
-
Notifications
You must be signed in to change notification settings - Fork 1
/
WRPC_DSM_UI_Accounts.py
207 lines (164 loc) · 7.16 KB
/
WRPC_DSM_UI_Accounts.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
from openpyxl import Workbook, load_workbook
from openpyxl.styles import Font, Color
from openpyxl.styles.colors import BLUE
from datetime import datetime
import pandas as pd
import requests
import os
import streamlit as st
import re
import io # Import io module
import pdfplumber # Alternative library for PDF processing
# Entity name that create_dataframe() writes into the "Name Of Entity" column.
search_text = "Arinsun_RUMS"
def create_file(df, fdf, sheet_name):
    """Append two DataFrames to a sheet of today's extraction workbook.

    The workbook is named ``Extracted Data_WRPC_SRPC_<dd-mm-YYYY>.xlsx`` and is
    looked up relative to the current working directory. It is created when it
    does not exist yet, otherwise loaded, and ``sheet_name`` is added to it
    when missing. ``df`` is written first (header row, then data rows),
    followed by two empty rows, a "(All figures in Rs.)" note row, and then
    ``fdf`` (header row, then data rows).

    Args:
        df: DataFrame written as the first table.
        fdf: DataFrame written as the second table, below the note row.
        sheet_name: Title of the worksheet that receives the rows.
    """
    today = datetime.now().strftime('%d-%m-%Y')
    workbook_path = f"Extracted Data_WRPC_SRPC_{today}.xlsx"

    if os.path.exists(workbook_path):
        workbook = load_workbook(workbook_path)
    else:
        # A brand-new workbook is written to disk straight away.
        workbook = Workbook()
        workbook.save(workbook_path)

    if sheet_name not in workbook.sheetnames:
        workbook.create_sheet(title=sheet_name)
    sheet = workbook[sheet_name]

    def write_table(frame):
        """Append the frame's column names, then each of its rows."""
        column_names = list(frame.columns)
        sheet.append(column_names)
        for _, record in frame.iterrows():
            sheet.append(record.to_list())
        return column_names

    main_columns = write_table(df)

    # Two spacer rows, then a note row padded to the width of the first table.
    sheet.append([])
    sheet.append([])
    sheet.append(["(All figures in Rs.)"] + [""] * (len(main_columns) - 1))

    write_table(fdf)

    workbook.save(workbook_path)
def create_dataframe(financial_data):
    """Build a one-row DataFrame describing the entity's DSM figures.

    Args:
        financial_data: Indexable sequence whose first four items are, in
            order, the payable amount, the receivable amount, the net DSM
            amount and the payable/receivable marker.

    Returns:
        A single-row DataFrame with the columns "Name Of Entity", "Payable",
        "Receivable", "Net DSM(Rs.)" and "Payable/Receivable", or ``None``
        (after printing a message) when ``financial_data`` is empty or falsy.
    """
    if financial_data:
        value_columns = ("Payable", "Receivable", "Net DSM(Rs.)", "Payable/Receivable")
        # The entity name always comes from the module-level search_text.
        record = {"Name Of Entity": [search_text]}
        for position, column in enumerate(value_columns):
            record[column] = [financial_data[position]]
        return pd.DataFrame(record)

    print("No financial data available.")
    return None
def extract_text_from_pdf(pdf_url):
    """Download a PDF and return the text of all of its pages.

    Args:
        pdf_url: URL of the PDF document to download.

    Returns:
        The text of every page, joined with newlines, or ``None`` when the
        download or the PDF parsing fails (the error is printed).
    """
    try:
        # A timeout keeps a stalled server from blocking the caller forever.
        response = requests.get(pdf_url, timeout=60)
        # Fail on HTTP errors instead of handing an error page to the PDF parser.
        response.raise_for_status()

        # The context manager closes the PDF once its text has been read.
        with pdfplumber.open(io.BytesIO(response.content)) as pdf:
            # A page without extractable text may yield None (depending on the
            # pdfplumber version); treat it as empty rather than failing the
            # whole document.
            page_texts = [page.extract_text() or '' for page in pdf.pages]

        # Keep a line break between pages so the last line of one page is not
        # fused with the first line of the next one.
        return '\n'.join(page_texts)
    except Exception as e:
        # Best-effort boundary: any download/parsing problem is reported and
        # signalled to the caller as None.
        print(f"Error extracting text from PDF: {e}")
        return None
def create_hyperlink(url, display_text):
    """Return an Excel ``HYPERLINK`` formula that shows ``display_text``.

    Double quotes inside either argument are doubled, which is how Excel
    escapes a quote inside a string literal; without this a quote would end
    the literal early and produce an invalid formula.

    Args:
        url: Link target.
        display_text: Text shown in the cell.

    Returns:
        The formula string, e.g. ``=HYPERLINK("http://x", "label")``.
    """
    safe_url = str(url).replace('"', '""')
    safe_text = str(display_text).replace('"', '""')
    return f'=HYPERLINK("{safe_url}", "{safe_text}")'
def first_row(extracted_text):
    """Return the first line of ``extracted_text`` that contains Arinsun_RUMS.

    Args:
        extracted_text: Text to search.

    Returns:
        The whole matching line (without its line break), or ``None`` after
        printing "No match found" when no line contains the entity name.
    """
    # '.' does not cross line breaks, so the match is exactly one line.
    hit = re.search(r'.*Arinsun_RUMS.*', extracted_text)
    if hit is None:
        print("No match found")
        return None
    return hit.group(0)
# Constants used by fetch_pdfs() and its helpers. The patterns are compiled
# once here instead of on every loop iteration.
_WRPC_BASE_URL = "https://www.wrpc.gov.in"
_WEEK_PATTERN = re.compile(r'week=([\w.]+)')
_YY_PATTERN = re.compile(r'yy=(\w+)')
_DAILY_ROW_PATTERN = re.compile(r'(\d{2}-\w{3}|Total)\s(Arinsun_RUMS)\s(\d+\.\d+)\s(\d+\.\d+)\s?(\d+\.\d+)?\s?(\d+\.\d+)?\s?(\-?\d+\.\d+)?')
_DAILY_HEADERS = ['Date', 'Entity', 'Injection', 'Schedule', 'DSM Payable', 'DSM Receivable', 'Net DMC']
_SUMMARY_HEADERS = ['Sr.', 'Name of Entity', 'DSM Charges (Rs.) Payable', 'DSM Charges (Rs.) Receivable', 'Net DSM(Rs.)', 'Net DSM(Rs.) Payable/Receivable']


def _parse_week_listing(ui_text, title_filter):
    """Map a display title to its summary-PDF URL for each listing line kept by the filter.

    The first line of ``ui_text`` is skipped. Every other line is expected to
    hold five comma-separated fields: from date, to date, link, issue date and
    status. Lines with a different field count, or whose link lacks a
    ``week=`` or ``yy=`` parameter, are ignored.
    """
    wanted = title_filter.lower()
    listing = {}
    for line in ui_text.split("\n")[1:]:
        parts = line.split(",")
        if len(parts) != 5:
            continue
        from_date, to_date, link, issue_date, status = parts

        week_match = _WEEK_PATTERN.search(link)
        yy_match = _YY_PATTERN.search(link)
        if week_match is None or yy_match is None:
            # Malformed link: skip the line instead of crashing on .group().
            continue
        week = week_match.group(1)
        yy = yy_match.group(1)

        # Keep the entry when the filter starts with the first three
        # characters of the link's yy value (compared case-insensitively).
        if not wanted.startswith(yy.lower()[:3]):
            continue

        status_text = "Revised" if status.strip() == "R" else "Issued"
        title = f"Week{week}: {from_date} to {to_date}\n({status_text} on {issue_date})"
        listing[title] = f"{_WRPC_BASE_URL}/htm/{yy}/sum{week}.pdf"
    return listing


def _rows_from_pdf_text(pdf_url, extracted_text):
    """Return ``(daily_rows, summary_row)`` parsed from the text of one PDF.

    ``summary_row`` is ``None`` when first_row() finds no Arinsun_RUMS line.
    """
    hyperlink = create_hyperlink(pdf_url, pdf_url)

    daily_rows = []
    for match in _DAILY_ROW_PATTERN.findall(extracted_text):
        row = dict(zip(_DAILY_HEADERS, match))
        row['PDF URL'] = hyperlink
        daily_rows.append(row)

    first_line = first_row(extracted_text)
    if first_line is None:
        return daily_rows, None

    # Whitespace-separated tokens of the line are paired with the headers;
    # zip() drops any surplus tokens or headers.
    summary_row = dict(zip(_SUMMARY_HEADERS, first_line.split()))
    summary_row['PDF URL'] = hyperlink
    return daily_rows, summary_row


def fetch_pdfs(year, title_filter):
    """Let the user pick weekly WRPC summary PDFs and export their Arinsun_RUMS rows.

    Downloads the listing ``UI_<year>.txt`` from the WRPC site, renders one
    Streamlit checkbox per entry that passes ``title_filter`` and, once
    "Continue" is pressed, extracts the Arinsun_RUMS rows from every selected
    PDF and hands them to create_file() for the 'WRPC_DSM' sheet.

    A listing that cannot be fetched, a PDF whose text cannot be read and a
    PDF without an Arinsun_RUMS line are reported in the UI instead of
    raising.

    Args:
        year: Year used in the listing file name ``UI_<year>.txt``.
        title_filter: Filter text; an entry is kept when this value,
            lower-cased, starts with the first three characters of the
            entry's ``yy=`` link parameter.
    """
    st.warning("Please select the week for which you'd like to fetch data, then click 'Continue' below.")
    ui_link = f"{_WRPC_BASE_URL}/assets/data/UI_{year}.txt"

    if 'checkbox_values' not in st.session_state:
        st.session_state.checkbox_values = {}

    try:
        response = requests.get(ui_link, timeout=60)
    except requests.RequestException as exc:
        st.error(f"Could not fetch the week list from {ui_link}: {exc}")
        return
    if response.status_code != 200:
        st.error(f"Could not fetch the week list from {ui_link} (HTTP {response.status_code}).")
        return

    url_data = _parse_week_listing(response.text, title_filter)
    # Checkbox state is keyed by display title; the label shows the PDF URL
    # first and the title second.
    checkbox_values = {
        title: st.checkbox(f"{pdf_link}: {title}")
        for title, pdf_link in url_data.items()
    }

    if not st.button("Continue"):
        return

    selected_pdf = [url_data[title] for title, checked in checkbox_values.items() if checked]
    if not selected_pdf:
        st.error("Please select at least one URL before continuing.")
        return

    st.info("Extracting data. Please Wait!")
    table_data = []
    first_line_table_data = []
    for pdf_url in selected_pdf:
        print(pdf_url)
        extracted_text = extract_text_from_pdf(pdf_url)
        # Check if extracted text is bytes-like object and decode it to string
        if isinstance(extracted_text, bytes):
            extracted_text = extracted_text.decode("utf-8")
        if not extracted_text:
            # extract_text_from_pdf() returns None on failure.
            st.warning(f"No text could be extracted from {pdf_url}; skipping it.")
            continue

        daily_rows, summary_row = _rows_from_pdf_text(pdf_url, extracted_text)
        # An empty dict becomes a blank separator row in front of each PDF's rows.
        table_data.append({})
        table_data.extend(daily_rows)
        if summary_row is None:
            st.warning(f"No Arinsun_RUMS summary line found in {pdf_url}.")
        else:
            first_line_table_data.append(summary_row)

    if not table_data and not first_line_table_data:
        st.error("No data could be extracted from the selected PDFs.")
        return

    df = pd.DataFrame(table_data)
    fdf = pd.DataFrame(first_line_table_data)
    create_file(df, fdf, 'WRPC_DSM')
    st.success("Data extracted ✨")
    print("Data extracted for WRPC DSM UI Accounts✨")