From 33b43929115ea8916a2daa45030f3463e5e005fd Mon Sep 17 00:00:00 2001 From: Joshua Chen Date: Sat, 19 Oct 2024 02:44:28 -0400 Subject: [PATCH] refactor: remove stage.py, do DB committing in fewer steps --- docs/code-structure.md | 10 +-- docs/db_diagram.pdf | Bin 32218 -> 32075 bytes ferry/database/__init__.py | 3 +- ferry/database/database.py | 5 -- ferry/database/deploy.py | 139 ---------------------------------- ferry/database/models.py | 88 ++++++++++----------- ferry/database/stage.py | 84 -------------------- ferry/database/sync_db.py | 127 +++++++++++++++++++++++++++++++ ferry/transform/invariants.py | 4 +- main.py | 16 +--- 10 files changed, 176 insertions(+), 300 deletions(-) delete mode 100644 ferry/database/deploy.py delete mode 100644 ferry/database/stage.py create mode 100644 ferry/database/sync_db.py diff --git a/docs/code-structure.md b/docs/code-structure.md index f5db6b058..18ae734cf 100644 --- a/docs/code-structure.md +++ b/docs/code-structure.md @@ -32,12 +32,4 @@ If the `--snapshot-tables` argument is used, the analyzer will create a CSV file ## Database connector -The database connector code is located in `ferry/database`. - -- `stage.py`: stages the data in the database. -- `deploy.py`: checks table invariants, and regenerates the database based on the staged data. At this step we also run SQL to generate some derived tables. - -TODO: - -1. Move the SQL logic to the data analyzer. -2. Ferry should not regenerate the whole DB each time. +The database connector code is located in `ferry/database`. It takes the Pandas tables and imports them into the database. diff --git a/docs/db_diagram.pdf b/docs/db_diagram.pdf index 96c4f04a020152a0f3ea23d5b7eeca9bc35c8e7a..ddbc87e2a0f0cfcd787bee2ffb824fdda8567d0a 100644 GIT binary patch delta 10086 zcmZvgbx>Sgx8`wa+#z^^Yh&GwTX1)GcMT4WySoMm?he5%xCNKs5Fi96xXt^`+?u&J z@9e);{Z8$(p66M+>eSh5Pp}h@u!-x?rQI5pYpPJJMn2!Ep04#6L2T7LgT|50yBovo zjceVC4R2O)w`aoFj*T3?fy(_H%jbDpd^OME^*69r1UI}45NTAM=>_N1JjYzk zWr_%oDv!HY1#3lV-YVnP{SnQ7Z4|9lZY)EqFvzPmiYPsKa?@`%Pd|3wbyC_D|B>l# z!^|&OJ?j_$%6T@O=;d<|0$uW@_1M~pI=7@r>48;*>R|9aAW5fpI$f^)xx@jcE~pnk zECPW=ekJ`sz~0rLjv9t1eNPY*cS*5cU?_Yf+&yZGO7%gr@bBq-S|eA=Bik&^{Z#Ob z0}}#7D7)WpUt73xs-|mh)bTdpIJ(QexC$~6EpXT9ZxA1bwlwCp z)0rSaRahn==NWN%!E&t(|+72CVo!%)Ha>V zHK{Z1D2w0jp+xZM68BM}e1u5UH0_5kKkDp|ti}QukotZqViX+L;nzoYMzq`ps?O4# znX9Sw@S< zi0p?6w8qSek7R~dP@~!IR66&I=sCi5EiQP-ka?9iaV-^^evRiX=N)%dt;qKWviV@Z z+6|NbDFm;Wyn2Sk?_A1X(0lhX-vxM9-o2sD1#`JylXBMFw9mv|j28~%0SJ0My(Mv> zd%Zh#E8Su^bF0_c{YK@eeq24w7f*uCIH$GB!DidQ?0{N!C}={d zJwG%yADbDf|KSz=s-_3+GTiwpxr5FrCh!Ggg+A~#u;5L2ZYN!xt+Jowoc9VPA~J>so^`=v zCV#|Lt(o_R!KNQECALke!4q$X>iy(P%V|qZd z?^yZR(;!OLCF8XOf<3Z*oZq7`9Pn7)^Yiz$B$_Z_vD7~qzW;spW9}naTV+L}Ne(j! z<9N4bw0wD|_bLm$V?bk0DlYjTCN7x&1QK|CH0+R!rtTfj6Q}?xGE`30RssSKxK& zTXqSMOm`xE4S)Ig2ygTQF)p*VG>`rjfijtdG8@ZM@YT|JSAdr!1kEQj>%<5>w|ZqxAgL)+%^jXj1xf56~$6-rd z|3EPSsV~dIlxo5ZsP|b{K&nqGf>q56#9#`+fDclR<-=4evfPk=Djo_)ZE@08Pd`RL zSBW7`e#~R<{8M)tU8B!zHT1bg>B7GIXcf{pcP;L9r$*F?Q~v8-```58) zO|Co8xyXfDc?N@qpv%F$#79S^d~cMI{w%Vndv~@9n>)S~uGMa@vdIcP_(Lm&+=@mr z>F0TQxoyVW7_9bb=Mc1FXsf7ZRM^42EL0=}K^Z9qFX-_W{pk%ZoS&8Ig|oSX|e zX$L!Jk+TVNpp=MyVUkac>7`Y76GhXHv{)7(B zvKT*UXCtKkF7qgwp832nKtmPPXfBZ9WvDBwdmMgC)M8Tuu$y1IT8Q^z%UY!U>|`D5 zF0jjSgxhdJcD}@6xXY7z>}kW80a$VjEGob@(^Ey>hYqE6_&dtS=5{Ip0**EDT-Mfp z+Qo@FTa+=3rSX|KezB{6w^k<)WfU*m45hXZ$iT&2WB3~JJT9$eGk$?y?3nmTLsu>_*qmxadWh;$kNw@LJEojI|kB4H8m`aBCXtGm?neGn<&wTN}R;YSkfkx9qY1e{4NR<@~( z*PM%Bk%M~gSr{0W`67|((u`w#>y;}c1Q_TO&=|d%!+)otlA8gT~70+ef3q*sML~tVb*gx&C0Qwl+&o{88Se z963z{z)|*(zg6J6-MGy~tr&3nE~GyA@|a9Gs2BcKiD5-yMU2*%Dpu6~a$OC>3HuaA z54SWEg?b2fNgMt1N)Z4ShWN^`NU&`Hh%&jo0{EeW6oBzVZ{^8FKO6%ap8yFQ34QK= zP_!n$rGc!PHO~#0a3MQy@%2XX1WMw5t}bs}3FZNP;!s-YeQ@8X9v*Vm6i2t@*%q~- zaRyd%+`n*RNRyuL(}+l{fj?_+?dqw(V1PSSn*H}B1L5GOa9l}Y!oL}-Fg1WDfGE)8_bqLhLZT2wfD=t?^z%*8%B<&4~Uq@!TY9!6~~5Wbd12UhcM5t zf+Jcl3k&8s5}`?7YsV)Mce3Ks_+56YZw_Z&8L`GH+z`_5!QH__)au{{XCd`v zVWnhL`zaIl2S>pOdhxkbkyy-xbcsK)y|73!^#`okIhFc!&7NOwfzZb>hR3d+^W9%` zEU7>jVpOy5o=GiQ#&(=-khpA=#QJYjQ9+7DBgf%fA>dPj3gCc06rnzqmiw+x+{(y9 zoW;Z_N_%bTQw9W!+$!$3^UscUvcVp8(#Cyk`PT?Xi_3RI2$FgXbmeXtTnv6e{j-5L zl`T=th;@DdaKmAA4YY5D*(#`ek5!?r(;@Z6-4Q$}kpE1!FXSuOi_}tmq+DrD8%?kw z`$dVHZcB5}lC${bdt)05irMedl{bxHR`4}irH7Og@jcBmA%7#2=YIN@(6C~UN>B~x z=;m`xw4h$i)ya%C5@I{+tzq8fW`Nuzl~MNEjC*}t6(fV#HPo?-py+ee(u|G`|2tI{ z#_9rs>gFYj&|tM`mm&pS3J)(h)f;1i#dQb|qr^8J13iQKo8JbQjzb6NmO_c&qv{jB&F*DSl?{hbfBWuB6_Zms0a8L*u z0FMY}BlM5!@~MNDT+_DWLKwRqm4Kh#=;in}OD6r5zbGl-u+tg%TghTcUHPldYM?y-u3m{$IcoRWU%V14Z-rPHY?VSMSXQyFi(MT zEiB9wZua-8FSPM}_chLNHudh7AYQ+SB3PSX8`lM!#bPya%7eOqs(v5N6$as71&VLX zIZ~fky0wL+wjE&LP_ca$2cYfF@Q|Xu8jStGHzJtg~RTF|rRswZUBfWVS z(%M$mQ;-dh^R-NujHsIE$NAD_Xy?iICo^};Y&ZRlVWB4QvK$R&*2~{HzcM*<$~5f1yTU%82ba1wq z^3BQqm2m=%EZL#M$DbRP>i`Eds&>RQY+#ur4jnuCCmPVXUh;si7aw(v-&5 zmJH7sX0eJ0jjDMSo9g4vg-`u-nj@un&@MN#ab|(PD{(sy3cdvy;%Opae9xDTDqY&P zocdR?dth)nsn^nHIm*j8vp?@K#TAV*}4!fo>^p)b+u1t#t0yR2d8)F2dxx zyuVcg%D!l&;@{U**_p0S^gW@>hjp>8<0bxT6-Gl{(^@!FoyS+E<;-UT8>TGgxaSzr ztqPhkvw6?D*aZ7$CIlhvi(>8m6ip39m#>Ir1-J5us*wfKYaordq?DEy1EgcXU}utH zs1D&dx>q&)b6Dp0>z4BQAJcDNMZk#%Fd=%ry~&586I2+Ap<~M-1eu`8T8Ey? z!K@Undp-vLH>EOi;XIGweG5VZy)T|Cii{lIFRQWpppdMFx$kk3MYHDIIjojdv(sw3 ztd~n(de#2*)Ek+Ovfwhkxf*zX?{GM?po+d+r?ECd5;TRd1F;E>rJ?kpC4+E-VFPHX z!GS@#!BU`t)%*{1P|=h*&!MgqMwd-N1HNol^0aF5Cr7R3jImkAj1whA1vLQ{W(W&C zKTk+WQ|^h0PQNv;<&G3sBb{i9V(80l^!0n!k&5geikqJ%YVfKgU+T~8l^(3FP+aJ^ zxR|Y!J61F2?&n%>Q}GW*r>AEoChMW^1W5Dk>@Kkt3X1QT9pCF4B*K`92RK6qVk6KE zy7~u%;vJkR>Z=vxo#l|)2!>@eeMR4kdV^;ba3P>kw9Vs9bfG?&&i;wnm+~a zV#(TV@wp&**EpobegPt#$gpS9~_bw5n#0`JJ!oXAax`{k-XJ*_2epk5naf5 zq^i<8gwj#{+Euf*6gv`CRa6>_cdO38a_i&iBcUJSfVYD`?epH#0(`QT)~b?gC|f9R z<;_+8T2FHRC9e-X53jy{clXe$d5SB+eWP=|^R3+@D&?<~}1nVw-k?Lz}Kj zS$9H8HJ)*YD(8|)MQu~_$DdMh0oef5a2%F`dpDHm{e|2U*w^Y@A)3@AmI-<}(yH@Z z-cM(?9x-0WmA7f^(D4(Nhe6?ImmU6P*5@;}^#1CT=i&huEn!U_=S65(S`L#OmZP7q z)b1y>x%sT*%s+z?@?P&uOv8zZaiiyWcnil|``STEkYORzk*{c`zb*uK0+REm{dmB( z>g%Bl&vCIdc!mr3C3B=PcD97!Z==;!6(;80TyvrfZ9qb5*G^H9S+Ms)isgb*_lT2^sb+>v}Uzs#{ z+xT@2lE#mamYkQpWCgd6{p>ivNIaEiTC=#*!_?iOxzIFTQ8x{}7W_4YHh*qBBLET> zqIH;Rm2A$lEO{0}s8IiVrAifVIq9;XW> zc6>Kq(W-STwf?8d)O5yt_P4 z0NdNNS@Nz>Wieb(?|W_XcfxNcdG(pr5}~3Vzo=k^B>D9!B~wR6(v5U zkhC6K7cp7eiTSxcBT}#}gH6W((4-h!ayELZd>;OcOW1_pc)b+-tJ>HZLM|v#qcSh+ z5nSlGMZm>3AUfsV;me0VWxXO?$%r&aQbJOdul|WUKo1&seKi66u=XvuE|pL#iBvofE~JCvJZ#m1~dSm=c|oRa03c#!Wq= zr6i};R#1`eP1Ps?wno1q`X#((bO5(Pz_ zj}XE6>-|_^8_dn=M%A&Q{2>_Eh}``+SXosbkF0@F;W=_A(1G;{u8npmfg?h{m~AYn zxPy3V1k%yj65^3!p>&z*y}tq8?a;(7X-KpUVG z{1W8uB)yuwQ_GCKk1ADZ+UE0L&6d96^?b`aGsbE^Dp31?RkyhOj;2FtY&2P$G9row zdVG_B)yriAL5seV@YnlMnBZP}?lz)u7WP?^#SG~Eq?xw7@8L!EKuYdMEIUY(HKB@| z@-@4#xJXMIS8xapcOXN5M8#Aesk^AEJBUuUgq))ivSa+F7nwiDkN%RBpPe)*HX-vN zIV4zVJXLJYy+XAzjUw$`$w+-TZGegXq*J-51lVBijw=j%TLeWzv-B|XZjHZKkD=4S+XPd*-%qv zCe+%aDU*tD!a@*d;bjfw1;lC4gyTkOgrd!ipLQlrOlY) zk*H>hlwM5m2KJD`YTSk(n%i*xq%Z}o!c+$uda&k9rPuqe?9K7dp&C`P&u5(5UDDs( zR-W5in=jXM9@qUkPx$iM)qD2_y-t{=pw|}E8v!X>pJ1IUugu)OwQ;?Hg_tFZ%i%SA zJq9FRN{Rv`qshz%Bw^w62IM!lF6;3Aygf?MB@Ho|U_FP8e=z>=@q3jnv$|?kc3Zs- zcIhiGw`++XWsanG+PeIb4l47$qOO>!4iNdQSXIQ)($|>CPoYVxG2%f3`i%6cxizU{ ziT9O|fAG3kfFY@BAICMPOVzOmCuU#Po3>dso1Q#vd2C{TaKBSdeQbq>$F|K|klCD3 znSHUzg(W^Ng>MH#;Z^0^99%#mWvH2REW!LJp*Bm_O6Q4`4}L~HDEkxUGza^lYcZKe zw!a*1U09L!fPyks0+4p}beKR=af};g~eVj%%#Ti9d zatGZ$Tn{xXWuGqLm%dN0K4V@4uv2S|CeMmd{>!$xq1d(LN$SMD=TGGFjPEA6mW4zk z%6xL<{Rs^|$-Bk1$*d(Vw%U?Up(Sm_hTnYEGWpFHwIEX>hNwcW`EsixzkrC^#kB_zbpU(rvNnn_W zobjWZY4vC4D^W$H!*c-WF=kKv6<1y3|lW38gS`Z4zT`D_U8;Ab`Zm3?= z5}IjSYv7}WwwO7-Voz1cg3mIwJ{7;xJzeDY*?m8HT&=*Ug0dY5vmbe{ZrXHZ4Zn+w zs-AI%`J)kBD-rQ_h}es8n)h;=Aw;huRq{-@K3-K2s->zg+JK?E->thDDJ&M#+{V5e zk=Wnndh@^-Pg&+(7$+>%Cc z#>65?2gE%e%Aj9!*Y{j+-XPhjin-_^#;D#3QGRBTiippt`(^qRFTNG&CBZP}uvZ16 zNI>Czcq@1+u@?NuQttL%Rvpr^^rVO80QQs81$cr~I-BmOKcq-Ha2s@^yDjMu*0{wv zV%E4NRj^yq^1)=y;sR$QL(R#8Du0tG#?6nGY+cF6F{oWoMEO)@i;fB9=;Xj8Fl23k zqPRJIZE_N!Vxjbws)X8BO)rt8B@2ml5%0Zi<@EO!!L(^9DULi3w%U?L5TFH5Mi*^f z73F5h_*@zv&NqO3(ih>CkN5)|lvgx4Husz2b{oz!U9gebPrINExL(}*b2ufy zb5kEXVgOG6jMBHO8RmCc!U8oh)}HF^W6xw%NIp@=vG*sqQJ+Eh~$d` zG_0<+97^o_%~#deO-H{v%}vT{sfm_!TxjRWZ25~ z=c5!{R}cPg@vC{bWnBIxfFBe2Z)?*$dgwxo(%b^Q|9vRpvhsZPd%xVq5kw(LhyY<+ z0^dav%&W@wARHy`eL>$M93{5bTO~}d(RL-wtLU~r9F+2#@9zswI4wYR+#ms6my~f_ z-%q#x7{lve9F+e6t}st41W*Ul@4Fpy@#an zl@6O+ra}zK`WaIQglIA8nY~(&>7fMe_cUNa0;aCfurNG_F6I6xP#04$Uak8js$*?UO#5*c~lcLsheR5G5qVX0M@Vhkztwkw=j$a*v)K_)wJ zCQJ)uGH$?x_tH-r#;wsnfS$tbc5Ic7dB(%Uqb8T2`GSSWGHky9F~ZVu7b=cuIro^A z0~`$gr2;W`7NW%M?>~c~%uNVky0`+6F*t)bRt7ulAIp=6Z zO&SE?Mp-~xaKT$f-KX29)`_Rm{Ek@TER%TG=vqppopfqUa3?s4^zwGUZN%F5ww-|h zg~tS`aRV$I&Hw9W=by^S)E2MBhPvQ)tys=R{8U z#l71JF1r3w-3RI3)O@HbqmF!^_2i)Aa@>_CF)wnN1j^uXtoP--Mx;c5!FQT9b`J>M zSZ+U5hhpp?L+Sr8OV73%e=prL48P!$N09)X^QYWybeyJ~DqXfChBDR31tfpQU^Dqn zj4b@cy4b$3p_Slhs<&IU;iu`%9kxUfMT+`3{zo-RuCm0h=4?@{O|+7hG?q>ZhC_xD zh8i4N#k1A3-{=qUY%r^Nl?`1CX@#x~q&nnemZawvs+lVoXR0}``PB+coitQdx$}L6 zZ+M0m#s$8uiywc!_R4q5gm1sPLz_=en@yRbr-J23IirUlDU0hWXvwEQK?EtzJRed( zo_Hyq3^V^7n=oF&0-H3LhLn(afgmmpU<#Fw6$_A;2SNq>&!Tdq0{rg+$PMBAXE1*; z5Em$=$%hWg!S&Z84j>l@^cUmg;^E-_n}LB`ynit+AP+akUyOr`7yK_tJV5UMBMJK7 zbn=2g|I*0|g8VIp0|@+=z<(IzZ!sJ|Fz>(2adG@l!_a?X;o#!n0{=@j_rKh6aB=@j z;6FRL{&vT~&B6Vzv^YVa{||#e|K7~W#mVuvlYiKM#r6Nv;^h9v@?WAkK^&mJjr_xS z{~ie^hzIyzNkjkdG5H5_^8D+CVBWvu=LB=`{!1V)1oE%Oyu2KLH|7Fza{tdB>_Ff@ i5cIc`e~vTf-?RJA=y*~>{1pHk5D0*lRzgV<@V@|Wrjr-| delta 10222 zcmZvgRahNev#o=>ySwXR!CJUG3GObzHNmx!;O-D4K+xds1a}P*+?@m`IGpd_XFuoJ zJKvc%UH$f~t{P+3)f{_(S$>2`SOpZdY3wLJol9t2S1+Zf`_>}ye5+Ub<8|A$u2k=9 z%+@GWZD|v?(QDG|t1+~H$3jsu_0xrrGq#lpngb-MSp%~>Ia<(rD+V+PxFWNAgJ0izz#f5WuNy@ zpB~vO2r)_rbe=;}Ab}-)O(;T4aIp|W%W2n3>9UJ;IIQ!Y8qO1l+U=B<6zY8lFOU*y zUdVaphSq<58bi1_11p|BO99TL)Bp5Y%JUI>_=x~9JmyLSJ0 zdTZ+AX!ut0aCb&H;ZuVwIr<)sDy5YKJy-YNpPxJde=DyP9lEA00vG8q7o)^f1_Fpiva%t)D=~ zokq{DO4{lh-Iz7(zPKz*g`~aV?ewT*oifGaHq0Zgt@f9@QvHk&3-K#IlkA|$?kLYV+ZN!lKe=o+iSr_6tFhiopwi?Codt%_Cw|fdPTVNp^pDgMhb_N9=~q3~ zZuQu=tlT@@@ue-6X~(JcZn2+sVm$8um_B;LxDxoa{nU45ks5QIUf4eCjT_(d(FZIp zv-Xn@U0ove(R-YPU``K*wQ=2$T%~k z$SevE@@{W8Uz|bnYJ!$XxTlEX?o=no)@jQqU{e`{XJdAj(^jVdOs1LRrh3)N-$d#OW}jGD4SKp! zRB6GgKfveq&70m|xDb^W_c6YPCUNGQ*Q0&1Ybk6Bm=Hoy&Dl)0 z`;bro=my5JFUD{yzOU`jWt~TPspNSvB3>*aZp2>n!!jdZoZSrn8iXvSD{?xBv^aVX zQ>NUFnA#H-B+e{L5niHJ?^Nvn!@Td)GL)TF$sIPVR0dt)c{1JT;it!dc2#P<`P2fF zSCjFzL$>|VJx#}BS+o(HD%sr~n|a*~E8DLqK*2va{#b@erfHC44Cf8U0u8%H#35d` zf^~M8XD+2J5-D|~pI=mlLboLH^@kL!e3-fc+t>+2uV9m-7!!_j-|R8UqB>4YonO)8 z)7hIwj5|gmuGdFe+b$iOR8!3-1bmfJ{U)%ZTBJJ0WToyV$ zP962ij#i#x+qV~|Vn&&-2*%19`L7~8z=0DXe~W5n63eDMp7YgJaP{W6 z4R!P5_mr6E9IMycr;IOiHk|^=Us)TtYRnW}o*uKVm zx!M@&c+@$wGexwoy}%~KmOg+CUJ z9g#Va{Tw5+nKEX4qg>MrA?Y{H2g0+tyarMs3Fxda)&!_DtRW{n~jSw88 z?#jhLZx=Dy1274HCL*5G{fvKIU!R(r~)k!>#EjX&7KD1LruX`Pwmr zJaS~JhUc+CQt66FiLzk`PSSZ(L@w{ub7P0lF=@2YiaTd}Ubn&;{>+p!%&s!iin6+V zZ48YcOZqUF8A|WlsN1nMqgP-Q;w8=w6NSmtgH5}XgMoAD$6OnHvYR9tVpzBfdx9g3 z*ecJ=v&FP*kn_>-@MY@+0Z4XASx7Wxe}8KiI}$kLi~qJBRG>$@X*alKZu32L z95x&>W8j&%h@N*z%QdPnVikAYC1M=S5+)?6#A_2WTGJ!Z-LNGKhY1T@=&8C@xnpA( zv-J?(n&zuVd#Qe;Br-U+!T8CB*F<2@vWClBEpz{;LesjlulM9z4!{{~&$XghZWKp# zy3ldFz`B2Hf9&ynU+qxbDpqma36&zyZ}}l>^~b#E$Ca=u8;i7r73OKxh)E(;oNgfQ z&%2$6HRUfTWxdYRT{?Zw52Hk1O~YDLm>1t+OQ)JsGfN_0u4&4VAo}$9Ikp8gvDgre3az|`zzM=|QAO>IvSuV%#tobn-+?FS9z0fhdbGoc0nQY$9s;{czS@ii)T&9}7hM5QtQ?)Y~&Rw0S$L zYqp}qI>4QRNhrvi!*NQ4fbCPiqP`;M)X!bBMNZQkWX3Gbo(M#J;^$0+=#AMRYFS&U zmM!?r#InB~rYIF}mD=xu*^3dQf4!0tkoT@ApvM_tJOi+;;9iFL%)8s;}e@nqn z$R6>Ll>VbckqrDiYdBw$Yzi7V*&jvwD(cL5RWag@Hy@$XlZb`0Zgj=_{PuY@TmdOT zO@T-XF7W(upWXqe)MUqn2*V)`{HBEa4lQkA&psz^^#D?d&jHlCxo z8MhAGtR_N3-d)eheJ3{Ty89DD$0S`0B9khHGT;thV^~{E&^23G@F)#CQj?(K)YsC| zSmEY26rn;1su!aKNVzG0qgtS?^hUhs+Q6%gkG&+k?5zPNWk*1bKsJVF%N{hGj!lj5 z8||^T)-{>(!&NhXa{bHZn*D5EXsCCiEer%mwRx=hv1`1 zP&jz&R$+0iKRD}i2Q{wJZ^NjV{AQ`*Z#T0Zs$dC_mey~|!o?m_q4KrQXe&(yuuF*r zbBjg8x{p8x%nB%P2)*!3u3jnHUc2bsm(tQLI=*=2A8LU-QnRprd7b8q6a7;sS;Cmk zmuWoFiOh%`lzKui11pTkm{jSgTy{XoJ{_a3(~~r#ZL| zGYy@QnrbIx9Gnf{6Zm^6tS8P~XgXWT@!-lJh0DPEgI21x)J8XaybSmJj0(L(i!C9d zIICVAj?_Y7}_+Y_AwM-ErkS;CB3N&rf^# zPZ<4MNmgkzJKLiPWQ3CJ9oeh~{om*C^jAHs-$p-7YeYpTjW@NKo}_P(uSQW&j>psP z%9*Y4I6&c98gx?k2U!tVmedc1cHZ*+sn}is5cor8N1!>~MIDYV7-JanRmzJA@95r| zyBc)9vE%#b-n(GjC1Ms0d6jDZF(Cme(`99>M?4UtTBUPxtW|6AHu4{Af$pypod`~z z_&l#*wf`2abvA0q;#hna&qfZq^=##TmUqsG6i6lXMZv)AJ3RWePiV$?kVL$|6w9q!u* z#{LS6M1;@C8Q29Qv4|SrafMqlV4I=V{@tPW@|=Kqs62%UHozQbdvl)wLT=FKT`%Sx zTS;eeMN78Wiv!pEGHHYhTo-MptFN^IPmdy(ceyzyKF;CqwovdXPm`&Xj`y2LC2$oC z?CIn5;mISe);}VP=sLSP!cO?hxOfluwMwDCnUIUDRv)5KYDWk;lpVFG+!iGDgZcAB z`W1B{Ft`;lfmbPFhyC|0YU-r{jhs)%eCCq$6~(qV>GnhT-7DQcuotHQhl*#`k1+4IhZo;BI)!7x;Ai6Aaxin z*r1ld=iF?!?*VyDcaC%915uzx-KHJgjf~!M;(LmHswHL{y{Yxe93FrIh!dX33)@5} zOHsj;5`99-h}LA*J4ARxJ%EY@kN)$Yr#IWD5`F?IW|T5o#ssDs z7aAeaqMl}7QmaB`Gp2EDU6knOg`JzED*Z#%4mr(3U!jt3h?kn$#kOw~eeSc}SO@E1 zyVYyP9Dmp5n{PT?=Hl=TG~)DqThFFBY-^U`umfsF-P{r@D4=drdu--We$-`o!jvH$ z4{`Ryxqhi13`cJeIH2bhBI%;rV{DsdpEX!IeXR5Q-bIRU8GoP&C06o5S6pBGhB6!pJ%dS(CC@%GY` zX%cgvs$6egQTlV8G#!adpPwA4pi2s$2v5V@2TwyPAJGC&H}(cDk^6X)Uy#!#Ms1k3 z0E`%^AWKIbLQ0T;Y~un)l=~qw6VY2#+Cj>4vJ+)A>n!Vb!}siQ{q|eiH>Vtpu`-SC z4_W{p_Zw~r4(hwHazbsYtcF&+Zgm)HBZHT6f)^C7Sjd~jKkELlNEp-adh}@nWdL1>iwhQT0Nx92?84UC_zo$18Zd^5~nhNa(j_t+fp>X`k zrJ_4&24h69Pd8hPAc3a1GMv+eV^X)+CX|-`vi_cnbT6O6QOyG-cLZ15ENU9&3 zUOCc+Q-U5T-&}12QAW_KauQ8_JCui%yfblKK1~UQxhD{}G$kJpVEhBn9A7uNZg9?G zl!$h376(o3$z1Bohwp6F%+oH7TXcwW{XD{u<;)tq^IOjw`E@fw$v^C4c`KpfYzTa% zg(6}x-=urJvoa~5tWxM$-d`XpF?5goDkiCM%Q7j9`d(rqGeXMGbM#`3#oEDGdxnp} zC;WFIZWdH?T*(%iLBfV%V$DnXPPTr@yovO;FkerS>ARYNB{@=_7?NUi*D!`Y!PPI; z%-n<-sN?r(e)fsA)0#G%4TiQxmU@8Yn(dgg@D((#NB&J|k=32JC$(g93NVq9#q zcIQpP^Xtb98R8?|2dmNn?ozadl$e1Ba8|-LhkLPve*z4@UIiA3vdqwt2y181>itGg z*lhzRI$f^jtvrx#kFyMwbyQ4_??#T`vRUQ^TK(1?7*v1Gc~^shq}Tyr!vF|FJ9z zW1>}Vv)RSt~A8!3(0zY!L2;6z;QjAV0uZevv2IJ*PM|$*H*6@?6;2@oUH8ncmLdWylnJc@rzTx z3~mbD4ZC13h5tB0z+&4f&7HWEByr zW|m&Hgffu^H~GN7^ZpS%`xi^Z*MVEPF!2D^lhE3#$ctG+uf*8*c*ct?qeVK}^K%vY ztxju-gr*t;oEK$x(jEQ9Tg%p2Szom9;A&N*nNKa!LC}FQ#3_LJ0@X8#qc(vou70%p z?ND~O2ogC3gi+7zvw7n(#W#U`@Ozd0mM*YAj=Nl);MlhPj@GY)@|5g`>mEAsPuS-P zUnD`REXS3YQ>i5LjRiqklL2N&<&Q;TIbVF)e4l!XP|A*0sDxhg4V&E6j>;s2#OcSV z<8aOv1%pPYEj@s8URUnu?o6d8$u<$+QX|?oPJ0I32wGe4WYEQ0Ju`|hD{hi;*%9WD z`K^YD4?+*o%fDQsmdRcBi5gk#_Orp?cNuCV?&`dE+8&48J%#uT-Q^}@qD!gA%(q%1 zxn~$=5cAi}bfSvP+5e;{Sn_)f+uBA66$gqU6DfX6s+0$oenCFGenK!RD7n4=Z&jf+_Oq=$66LMz*~Ba8;f{)6F^AG-3R9JWEQmNN z(_07;y?w>kB{3WS4Hlk%2AKkWVj4xGM`_`!)jmKSxeei|$we2XpX8a%C!pXM`&psa zUy-n{F4IfQvWPUt zy-u#&0_6+AF=S)TUPS;nZV)&($#3N|*6-}anp^*zz8!Ig zmj#URDCIItic*d)*Lq*YP1}4JKd`XK4Byt>psvaqHW6%%abQJe`Y{L|1Kn;d`XHUS z9MiijRgUrsTE~5oaJD7SSt5G4PhnX2<}vf`cXQ)m;OEMGMFG+wTVlwIa)GE8`XV00+QsM)|YHI!IDfeOL|@HBmYXcgiQ`AY}S$ zI?GL)n?L(_>BB^RpE>cO?^4HU1Y!@^|9LG+w&T~Acpr9I9wqRZx3tKChLBx$x^ZwO zDEHut;09N^JBLw>pN;W|LW{(C7qW_C%mTZRX53iJ`_R~dju@RQea=~R+f{9M7D3<^ zE3rpt_n^y%n6l|eGz{}zArf4`7Tz->rd*$}T7vnW9jr}eA1ejNLWHQ1Z1~3_d6FXz zD~@IV@?;o`H4d$JL~Q3Surk*_m}2D9R!x{yc+_sfAQ0Ep1%!P(XA&zF&&T*+)ETnwm_KMoBJirL9qT^qx8#4nr# zFKy8ySrbat$MN+hLyW=|J|0Op?`ofDwlDqWH=eU%NUOJ9wOVN5xCA%fggzWSAPew) z+-ZE!50Um1aWv*3yT(Y&c;M}MT+aZ-h4!x$7|3@K*{ym@}7I|!KWM+?n8 z)A3(<_10ppW0Y9bja#y0Dj0I8{+K}nW#&doploSW?J$ElL)K?jmgFglew^ygXUp3h z<_xMFQ-tOBt4u1HZMiqlC5052j3)K|Ff3F5mP(oMxv(j}C4Jj0>Wl3K?|>KPCtBW_ zlbL}eZVBZOGY5Fg^dWJ~u|7apV9W)E2Pyl?F=&BmhBFZ>@ ziui);LaKIyWetOvi8yb7&rYobF1yC9JK_wz;qI<7LtC=*5b~AxkrF&rxw5{$daevg z34*y1d&65>u19iYA9l_8RcZ~{i{i#kK0?8vh6I08Zm7^~S)ysvg6vLD2Qd9wn)y`BDwC)7R>}j?~{_Ip+vB>;GKi>Rj5-FDy$R z>ssI2|KgoWzRZ)ZN*WsbeTmqbqhFE2wudn->z(T+9Uy7UMnwypa;kAyfaeO=a4{+p ziX=nkx{GVPATq~6I=qiexQb|Z22ULPsB(IsMu$+Xq{z;Zt|BB~6#Eim>n-*xH4dFo z%aTQ%_a8*advKkC5nH|plr@q044-ms8t5ud?JBl>p-PoK|9VcE9eAsN>4KNctTL$f z#5^#2j=n0LDzdTSmmAnht1kxMU9{D2&hD_P>oFkK}=tNSQh-v(&BP}q(Lz3p|l z-g-YuRLXIuwji#ZJ*`#8H>VtHZ<`x&{drEteie>tw#~-FHmE_{E--${)qzP5-G#_k znRAva2`^vbCv|!^%RJOY`j`g5w|}`4c`}BQPL2xl$ASUf%%_-R5b~wY;cwDuqT}rT zw9p438-_p&iRU4dnlCVbCAT_T2q($$uG^|)lal%b!uMLfX}Y~FS})1yEL|dpU`}03 zUijk9(w`qPIZ=p@JR1~xjtCsLmKa|AuVBB*zG%#!O+T*WwM?`PZt!ud8qzXddOHf2 zVT6Q11Tp}hZ4(xi7*ah=!|gNk^tU-Yyuaq^Sm-;7KZ(whMkGRlf)@?n#`7krWLB^G`!F2*mu3K~jEt4Wg zI-GC5IV^`aijM2P9ScBS@(tS+Ln6wFQ@rPEiPQlkP^t?Xa@B(arz;k8@iMIWOs#pTEJD>77;(8yOC zu1FiQX$^0U922;*;rnN^(xZo6YtB+$*tJJxaRA*jM{{>4eZ3tI+b@Jny$q85#JBU* zdyVv9Fkaz*g=R@*xbI~vOC-i_c5(Os9pA%cs)@~H!?z`s(e|wsjgpQsNsMOtm|H=l zG#!vK!+a^?V%0j|=^&~DM7FKAEDASQSqfbm=- z=!^8l&d!IY9aLe;@S)EgPr3*J*JcR-{s30UT~8wQD|OV__k;bjp-o>oiYE~$rMu}_ zyVd&1kmtl5W}_EVl$^~IorLgHO(>P}fRuCleLk;W`_RaShebRdR?>qhPuJcc+@t3O z9PI5G+uoz+){DNkiPz=?^@a`*_1JrvCE+Kvc|x%G92nw>=Es3zLU0$=-3g2w_DKZ2j9i?iR(=JZVaWxH$CS}+vTS++J@y&4xwKpqpyN#dN+~CT`=A&dyR%^ zc=sP&KT*mRa~Gu8db+47wF^W1W{t<+k&B8l_86jU=PI^$azRtbe)^jLBuFLGrM#QT zz~MDkBt04PzmAVE(`7`U>BZv?7CyF82 zfQj6P*lw-w%Y!;7PxBDNR@%hsnC&QOL&^b5KCi6qV*|NM1G()V%-S*+Y0T^F=Zry6 zR6}*AEhR~>knT}z&MNMU0a&)Tx;SZLM44W#2t=7KW|`q!6<+}W8I|j5^M!JSf&YZ)bF2HQCi?19$>Db!d*DA$ByV5u;JB7UHL z+xr8)^@QeKg;#djrj~KG+U`DBSXZ<3oEA!`+&k3MiNsU7q;asHj4ADs)wY?qRQ1y)wIN zImwf=x`*IF(4h%eJZ{hGY>T=}a_Iit=n@Uul!t|fE~cJ~+X6&X-;^-+OCtuDBV(&q zjqd8clsWDi{(jhxTrDVFd($N--HTg9d+W&f?UluJG|;C$0z(((4btktuenTNM*KyL zJQe_bczbwb{!v|61=BofZ6J9YR2I#@BOx#og>oSyoR7F;`{Rx_29LPh6n%maPX=p5 zSi(&wPf4OgJoGq9Tn=H`5!1;t-@n?yHLp^;DQS>YMhSVMf!7uNv%MwWb0q4RY!G6Q zPmrL~Hm6|F>L+3LF|~fwwo(F46mu>%Yt&!@J)-iK6H&kwEQC9Gf0p|8qdGw*YVNF>2K)7wbq^z@sI6HXFI%D7=XF)Yv5PqJ6|#$dnySvX20 z>aN?5T4jVgH|aX;Cd|z$GxIDPR^;iAq#ew}@w$OK22CR$7VmAcpe@C^^{KUr-u)wj zoo5DpvN#hp3?$i%i62o>L0?`@D*2p`puT|#0p@?aC(kioz(GNL^@A(}Du{d_J}`(U zInmpi9R%VAQFH%iQ9Dti{#U@w%Ma!M7lR1!B-8sa0uX_}K7qLTxS@YB9x#OOUknQ3 z=lN?Nm>a^+`xgW8f_VPb5+Ap~|7{8QAAJfy{~P{K#?SM25fC@ezZ(39fw})K%q;+h z{LR2TeE*99|5yY9^YH&G8O--Dbs#W=`=86v|E&LKE(pxe|F=32FAx8}-1?`)|A&Er zzms{uV94K!|6%-p7v|vw@%<|o0tWppga-l!|9vDp5D4#o{}1?oO8$Y6zsH0JA^`q- zK6s#D-oG0Jb94X8{(l(X--W^4|Je1v9OU{>CNJo3C14N;^1sgMAJ6|8aVUTCwck5b PFc^f&z#yX{i~4^6VZQwN diff --git a/ferry/database/__init__.py b/ferry/database/__init__.py index 37eec554b..30ba88b33 100644 --- a/ferry/database/__init__.py +++ b/ferry/database/__init__.py @@ -18,5 +18,4 @@ course_flags, course_professors, ) -from .stage import stage -from .deploy import deploy +from .sync_db import sync_db diff --git a/ferry/database/database.py b/ferry/database/database.py index cf52eca21..186811b1c 100644 --- a/ferry/database/database.py +++ b/ferry/database/database.py @@ -21,11 +21,6 @@ def __init__(self, connect_string: str): "keepalives_count": 5, }, ) - Base.metadata.create_all(self.Engine) - - if any(not table.endswith("_staged") for table in Base.metadata.tables): - raise MissingTablesError("Model tables should all end with _staged") - self.Session = sqlalchemy.orm.sessionmaker(bind=self.Engine) diff --git a/ferry/database/deploy.py b/ferry/database/deploy.py deleted file mode 100644 index 93eb67d8d..000000000 --- a/ferry/database/deploy.py +++ /dev/null @@ -1,139 +0,0 @@ -import logging -from pathlib import Path - -import sqlalchemy -from sqlalchemy import MetaData, text - -from ferry import database - -queries_dir = Path(__file__).parent / "queries" - - -def deploy(db: database.Database): - """ - Deploy staged tables to main ones and regenerate database. - """ - - # -------------------------------------- - # Check if all staged tables are present - # -------------------------------------- - - # sorted tables in the database - db_meta = MetaData() - db_meta.reflect(bind=db.Engine) - - # ordered tables defined only in our model - alchemy_tables = database.Base.metadata.sorted_tables - target_tables = [table.name[:-7] for table in alchemy_tables] - - db_tables = {x.name for x in db_meta.sorted_tables} - - if any(table.name not in db_tables for table in alchemy_tables): - raise database.MissingTablesError( - "Not all staged tables are present. Run stage.py again?" - ) - - # ------------------------------------- - # Upgrade staged tables to primary ones - # ------------------------------------- - - print("\nReplacing old tables with staged...") - - conn = db.Engine.connect() - - # keep track of main table constraints and indexes - # because staged tables do not have foreign key relationships - - replace = conn.begin() - conn.execute(text("SET CONSTRAINTS ALL DEFERRED;")) - - # drop and update tables in reverse dependency order - for table in target_tables: - - logging.debug(f"Updating table {table}") - - # remove the old table if it is present before - conn.execute(text(f"DROP TABLE IF EXISTS {table}_old CASCADE;")) - # rename current main table to _old - # (keep the old tables instead of dropping them - # so we can rollback if invariants don't pass) - conn.execute(text(f'ALTER TABLE IF EXISTS "{table}" RENAME TO {table}_old;')) - # rename staged table to main - conn.execute(text(f'ALTER TABLE IF EXISTS "{table}_staged" RENAME TO {table};')) - - replace.commit() - - print("\033[F", end="") - print("Replacing old tables with staged... ✔") - - # ----------------- - # Remove old tables - # ----------------- - - print("\nDeleting temporary old tables...") - delete = conn.begin() - conn.execute(text("SET CONSTRAINTS ALL DEFERRED;")) - for table in target_tables: - logging.debug(f"Dropping table {table}_old") - conn.execute(text(f"DROP TABLE IF EXISTS {table}_old CASCADE;")) - - delete.commit() - print("\033[F", end="") - print("Deleting temporary old tables... ✔") - - # ---------------------- - # Rename _staged indexes - # ---------------------- - - print("\nRenaming indexes...") - - # sorted tables in the database - db_meta.reflect(bind=db.Engine) - - # delete previous staged indexes - delete_indexes = conn.begin() - for meta_table in db_meta.sorted_tables: - for index in meta_table.indexes: - # remove the staged indexes - if index.name and "_staged" in index.name: - # conn.execute(text(schema.DropIndex(index))) - renamed = index.name.replace("_staged", "") - conn.execute( - text(f"ALTER INDEX IF EXISTS {index.name} RENAME TO {renamed};") - ) - # primary key indexes are not listed under meta_table.indexes - # so just rename these if they exist - conn.execute( - text( - f"ALTER INDEX IF EXISTS pk_{meta_table.name}_staged RENAME TO pk_{meta_table.name};" - ) - ) - delete_indexes.commit() - - print("\033[F", end="") - print("Renaming indexes... ✔") - - # ------- - # Reindex - # ------- - - print("\nReindexing...") - conn.execution_options(isolation_level="AUTOCOMMIT").execute( - text("REINDEX DATABASE postgres;") - ) - print("\033[F", end="") - print("Reindexing... ✔") - - # ------------- - # Final summary - # ------------- - - # Print row counts for each table. - print("\n[Table Statistics]") - with database.session_scope(db.Session) as db_session: - with open(queries_dir / "table_sizes.sql") as file: - SUMMARY_SQL = file.read() - - result = db_session.execute(text(SUMMARY_SQL)) - for table_counts in result: - print(f"{table_counts[1]:>25} - {table_counts[2]:6} rows") diff --git a/ferry/database/models.py b/ferry/database/models.py index d7ffe501f..deff47d52 100644 --- a/ferry/database/models.py +++ b/ferry/database/models.py @@ -20,11 +20,11 @@ meta = MetaData( naming_convention={ - "ix": "ix_%(column_0_label)s_staged", - "uq": "uq_%(table_name)s_%(column_0_name)s_staged", - "ck": "ck_%(table_name)s_%(constraint_name)s_staged", - "fk": "fk_%(table_name)s_%(column_0_name)s_%(referred_table_name)s_staged", - "pk": "pk_%(table_name)s_staged", + "ix": "ix_%(column_0_label)s", + "uq": "uq_%(table_name)s_%(column_0_name)s", + "ck": "ck_%(table_name)s_%(constraint_name)s", + "fk": "fk_%(table_name)s_%(column_0_name)s_%(referred_table_name)s", + "pk": "pk_%(table_name)s", } ) @@ -44,7 +44,7 @@ class Season(BaseModel): Seasons table. """ - __tablename__ = "seasons_staged" + __tablename__ = "seasons" season_code = Column( String(10), primary_key=True, @@ -68,17 +68,17 @@ class Season(BaseModel): # Course-Professor association/junction table. course_professors = Table( - "course_professors_staged", + "course_professors", Base.metadata, Column( "course_id", - ForeignKey("courses_staged.course_id"), + ForeignKey("courses.course_id"), primary_key=True, index=True, ), Column( "professor_id", - ForeignKey("professors_staged.professor_id"), + ForeignKey("professors.professor_id"), primary_key=True, index=True, ), @@ -90,19 +90,19 @@ class Course(BaseModel): Courses table. """ - __tablename__ = "courses_staged" + __tablename__ = "courses" course_id = Column(Integer, primary_key=True, index=True) season_code = Column( String(10), - ForeignKey("seasons_staged.season_code"), + ForeignKey("seasons.season_code"), comment="The season the course is being taught in", index=True, nullable=False, ) season = relationship( "Season", - backref="courses_staged", + backref="courses", cascade="all", foreign_keys="Course.season_code", ) @@ -257,7 +257,7 @@ class Course(BaseModel): last_offered_course_id = Column( Integer, - ForeignKey("courses_staged.course_id"), + ForeignKey("courses.course_id"), comment="""[computed] Most recent previous offering of course (excluding future ones)""", index=True, @@ -293,7 +293,7 @@ class Course(BaseModel): last_offered_course = relationship( "Course", - backref="courses_staged", + backref="courses", cascade="all", remote_side="Course.course_id", foreign_keys="Course.last_offered_course_id", @@ -301,14 +301,14 @@ class Course(BaseModel): last_enrollment_course_id = Column( Integer, - ForeignKey("courses_staged.course_id"), + ForeignKey("courses.course_id"), comment="[computed] Course from which last enrollment offering was pulled", index=True, ) last_enrollment_course = relationship( "Course", - backref="courses_staged_", + backref="courses", cascade="all", remote_side="Course.course_id", foreign_keys="Course.last_enrollment_course_id", @@ -321,14 +321,14 @@ class Course(BaseModel): last_enrollment_season_code = Column( String(10), - ForeignKey("seasons_staged.season_code"), + ForeignKey("seasons.season_code"), comment="[computed] Season in which last enrollment offering is from", index=True, ) last_enrollment_season = relationship( "Season", - backref="courses_staged_", + backref="courses_", cascade="all", foreign_keys="Course.last_enrollment_season_code", ) @@ -345,17 +345,17 @@ class Listing(BaseModel): Listings table. """ - __tablename__ = "listings_staged" + __tablename__ = "listings" listing_id = Column(Integer, primary_key=True, comment="Listing ID") course_id = Column( Integer, - ForeignKey("courses_staged.course_id"), + ForeignKey("courses.course_id"), comment="Course that the listing refers to", index=True, nullable=False, ) - course = relationship("Course", backref="listings_staged", cascade="all") + course = relationship("Course", backref="listings", cascade="all") school = Column( String, @@ -386,12 +386,12 @@ class Listing(BaseModel): ) season_code = Column( String(10), - ForeignKey("seasons_staged.season_code"), + ForeignKey("seasons.season_code"), comment="When the course/listing is being taught, mapping to `seasons`", index=True, nullable=False, ) - season = relationship("Season", backref="listings_staged", cascade="all") + season = relationship("Season", backref="listings", cascade="all") crn = Column( Integer, comment="The CRN associated with this listing", @@ -401,14 +401,14 @@ class Listing(BaseModel): __table_args__ = ( Index( - "idx_season_course_section_unique_staged", + "idx_season_course_section_unique", "season_code", "subject", "number", "section", ), Index( - "idx_season_code_crn_unique_staged", + "idx_season_code_crn_unique", "season_code", "crn", unique=True, @@ -421,7 +421,7 @@ class Flag(BaseModel): Course flags table. """ - __tablename__ = "flags_staged" + __tablename__ = "flags" flag_id = Column(Integer, comment="Flag ID", primary_key=True) @@ -430,17 +430,17 @@ class Flag(BaseModel): # Course-Flag association/junction table. course_flags = Table( - "course_flags_staged", + "course_flags", Base.metadata, Column( "course_id", - ForeignKey("courses_staged.course_id"), + ForeignKey("courses.course_id"), primary_key=True, index=True, ), Column( "flag_id", - ForeignKey("flags_staged.flag_id"), + ForeignKey("flags.flag_id"), primary_key=True, index=True, ), @@ -452,7 +452,7 @@ class Professor(BaseModel): Professors table. """ - __tablename__ = "professors_staged" + __tablename__ = "professors" professor_id = Column(Integer, comment="Professor ID", primary_key=True) name = Column(String, comment="Name of the professor", index=True, nullable=False) @@ -487,11 +487,11 @@ class EvaluationStatistics(BaseModel): Evaluation statistics table. """ - __tablename__ = "evaluation_statistics_staged" + __tablename__ = "evaluation_statistics" course_id = Column( Integer, - ForeignKey("courses_staged.course_id"), + ForeignKey("courses.course_id"), primary_key=True, comment="The course associated with these statistics", index=True, @@ -499,7 +499,7 @@ class EvaluationStatistics(BaseModel): ) course = relationship( "Course", - backref=backref("evaluation_statistics_staged", uselist=False), + backref=backref("evaluation_statistics", uselist=False), cascade="all", ) @@ -520,7 +520,7 @@ class EvaluationQuestion(BaseModel): Evaluation questions table. """ - __tablename__ = "evaluation_questions_staged" + __tablename__ = "evaluation_questions" question_code = Column( String, @@ -560,29 +560,29 @@ class EvaluationNarrative(BaseModel): Evaluation narratives (written ones) table. """ - __tablename__ = "evaluation_narratives_staged" + __tablename__ = "evaluation_narratives" id = Column(Integer, primary_key=True) course_id = Column( Integer, - ForeignKey("courses_staged.course_id"), + ForeignKey("courses.course_id"), comment="The course to which this narrative comment applies", index=True, nullable=False, ) course = relationship( - "Course", backref="evaluation_narratives_staged", cascade="all" + "Course", backref="evaluation_narratives", cascade="all" ) question_code = Column( String, - ForeignKey("evaluation_questions_staged.question_code"), + ForeignKey("evaluation_questions.question_code"), comment="Question to which this narrative comment responds", index=True, nullable=False, ) question = relationship( "EvaluationQuestion", - backref="evaluation_narratives_staged", + backref="evaluation_narratives", cascade="all", ) @@ -622,26 +622,26 @@ class EvaluationRating(BaseModel): Evaluation ratings (categorical ones) table. """ - __tablename__ = "evaluation_ratings_staged" + __tablename__ = "evaluation_ratings" id = Column(Integer, primary_key=True) course_id = Column( Integer, - ForeignKey("courses_staged.course_id"), + ForeignKey("courses.course_id"), comment="The course to which this rating applies", index=True, nullable=False, ) - course = relationship("Course", backref="evaluation_ratings_staged", cascade="all") + course = relationship("Course", backref="evaluation_ratings", cascade="all") question_code = Column( String, - ForeignKey("evaluation_questions_staged.question_code"), + ForeignKey("evaluation_questions.question_code"), comment="Question to which this rating responds", index=True, nullable=False, ) question = relationship( - "EvaluationQuestion", backref="evaluation_ratings_staged", cascade="all" + "EvaluationQuestion", backref="evaluation_ratings", cascade="all" ) rating = Column( diff --git a/ferry/database/stage.py b/ferry/database/stage.py deleted file mode 100644 index 4547dd0a1..000000000 --- a/ferry/database/stage.py +++ /dev/null @@ -1,84 +0,0 @@ -import csv -from io import StringIO - -import pandas as pd -from sqlalchemy import MetaData - -from ferry.database import Database, Base - - -class DatabaseStagingError(Exception): - pass - - -def stage(tables: dict[str, pd.DataFrame], database: Database): - staged_tables = { - f"{table_name}_staged": table for table_name, table in tables.items() - } - - # -------------------------- - # Replace tables in database - # -------------------------- - - # sorted tables in the database - db_meta = MetaData() - db_meta.reflect(bind=database.Engine) - - # Drop all tables - print("\nDropping all tables...") - db_meta.drop_all( - bind=database.Engine, - tables=list(reversed(db_meta.sorted_tables)), - checkfirst=True, - ) - print("\033[F", end="") - print("Dropping all tables... ✔") - - # Add new staging tables - print("\nAdding new staging tables...") - connection = database.Engine.raw_connection() - Base.metadata.create_all(database.Engine) - for table in Base.metadata.sorted_tables: - if table.name not in staged_tables: - raise ValueError( - f"{table.name} defined in Base metadata, but there is no data for it." - ) - # create in-memory buffer for DataFrame - buffer = StringIO() - # TODO is this really needed? - staged_tables[table.name] = staged_tables[table.name].replace( - {r'\r': ''}, regex=True) - staged_tables[table.name].to_csv( - buffer, - index_label="id", - header=False, - index=False, - sep="\t", - quoting=csv.QUOTE_NONE, - escapechar="\\", - na_rep="NULL", - ) - - buffer.seek(0) - - cursor = connection.cursor() - - try: - cursor.copy_from( - buffer, - table.name, - columns=staged_tables[table.name].columns, - sep="\t", - null="NULL", - ) - except Exception as error: - connection.rollback() - cursor.close() - raise DatabaseStagingError from error - - # print(f"Successfully copied {table_name}") - cursor.close() - connection.commit() - - print("\033[F", end="") - print("Adding new staging tables... ✔") diff --git a/ferry/database/sync_db.py b/ferry/database/sync_db.py new file mode 100644 index 000000000..5e77eaa0d --- /dev/null +++ b/ferry/database/sync_db.py @@ -0,0 +1,127 @@ +import pandas as pd +import csv +from io import StringIO +import logging +from pathlib import Path + +from sqlalchemy import MetaData, text + +from ferry import database +from ferry.database import Database, Base + + +queries_dir = Path(__file__).parent / "queries" + + +class DatabaseStagingError(Exception): + pass + + +def sync_db(tables: dict[str, pd.DataFrame], database_connect_string: str): + db = Database(database_connect_string) + + # sorted tables in the database + db_meta = MetaData() + db_meta.reflect(bind=db.Engine) + + # First step: existing -> old + print("\nMoving existing tables...") + + conn = db.Engine.connect() + replace = conn.begin() + conn.execute(text("SET CONSTRAINTS ALL DEFERRED;")) + for table in Base.metadata.sorted_tables: + logging.debug(f"Updating table {table}") + # remove the old table if it is present before + conn.execute(text(f"DROP TABLE IF EXISTS {table}_old CASCADE;")) + # rename current main table to _old + # (keep the old tables instead of dropping them + # so we can rollback if invariants don't pass) + conn.execute( + text(f'ALTER TABLE IF EXISTS "{table}" RENAME TO {table}_old;')) + # rename the primary key index + conn.execute( + text(f"ALTER INDEX IF EXISTS pk_{table} RENAME TO pk_{table}_old;")) + replace.commit() + + print("\033[F", end="") + print("Moving existing tables... ✔") + + # Second step: stage new tables + print("\nAdding new staging tables...") + # TODO this should probably be done within one transaction + conn = db.Engine.raw_connection() + Base.metadata.create_all(db.Engine) + for table in Base.metadata.sorted_tables: + if table.name not in tables: + raise ValueError( + f"{table.name} defined in Base metadata, but there is no data for it." + ) + # create in-memory buffer for DataFrame + buffer = StringIO() + # TODO is this really needed? + tables[table.name] = tables[table.name].replace( + {r'\r': ''}, regex=True) + tables[table.name].to_csv( + buffer, + index_label="id", + header=False, + index=False, + sep="\t", + quoting=csv.QUOTE_NONE, + escapechar="\\", + na_rep="NULL", + ) + + buffer.seek(0) + cursor = conn.cursor() + + try: + cursor.copy_from( + buffer, + table.name, + columns=tables[table.name].columns, + sep="\t", + null="NULL", + ) + except Exception as error: + conn.rollback() + cursor.close() + raise DatabaseStagingError from error + + # print(f"Successfully copied {table_name}") + cursor.close() + conn.commit() + + print("\033[F", end="") + print("Adding new staging tables... ✔") + + # Last: drop the _old tables + print("\nDeleting temporary old tables...") + + conn = db.Engine.connect() + delete = conn.begin() + conn.execute(text("SET CONSTRAINTS ALL DEFERRED;")) + for table in Base.metadata.sorted_tables: + logging.debug(f"Dropping table {table}_old") + conn.execute(text(f"DROP TABLE IF EXISTS {table}_old CASCADE;")) + delete.commit() + + print("\033[F", end="") + print("Deleting temporary old tables... ✔") + + print("\nReindexing...") + conn.execution_options(isolation_level="AUTOCOMMIT") + conn.execute(text("REINDEX DATABASE postgres;")) + print("\033[F", end="") + print("Reindexing... ✔") + + # Print row counts for each table. + print("\n[Table Statistics]") + with database.session_scope(db.Session) as db_session: + with open(queries_dir / "table_sizes.sql") as file: + SUMMARY_SQL = file.read() + + result = db_session.execute(text(SUMMARY_SQL)) + for table_counts in result: + print(f"{table_counts[1]:>25} - {table_counts[2]:6} rows") diff --git a/ferry/transform/invariants.py b/ferry/transform/invariants.py index 9bdf21ade..bcc99b8f9 100644 --- a/ferry/transform/invariants.py +++ b/ferry/transform/invariants.py @@ -27,10 +27,10 @@ def check_invariants(tables: dict[str, pd.DataFrame]): f"listing.season_code != course.season_code for {diff_season_code}" ) - courses_no_listing = tables["courses"]["course_id"].isin( + courses_no_listing = ~tables["courses"]["course_id"].isin( tables["listings"]["course_id"] ) - if not courses_no_listing.empty: + if courses_no_listing.any(): raise InvariantError( f"courses with no listing {tables['courses']['course_id'][courses_no_listing]}" ) diff --git a/main.py b/main.py index 596770b8f..d0301779e 100644 --- a/main.py +++ b/main.py @@ -2,7 +2,6 @@ import logging from pathlib import Path -import pandas as pd import uvloop from httpx import AsyncClient @@ -11,7 +10,7 @@ from ferry.crawler.classes import crawl_classes from ferry.crawler.evals import crawl_evals from ferry.crawler.seasons import fetch_seasons -from ferry.database import Database, deploy, stage +from ferry.database import sync_db from ferry.transform import transform, write_csvs from ferry.transform.to_table import create_evals_tables @@ -53,19 +52,6 @@ async def start_crawl(args: Args): print("-" * 80) -def sync_db(tables: dict[str, pd.DataFrame], database_connect_string: str): - db = Database(database_connect_string) - - print("[Stage]") - stage(tables, database=db) - print("-" * 80) - - print("[Deploy]") - deploy(db=db) - print("-" * 80) - print("Database sync: ✔") - - async def main(): args = get_args()