From c972592a2ffb5554969dac49ec0beed6181548af Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 3 Nov 2025 05:51:31 +0000 Subject: [PATCH] Optimize regex operations and remove unnecessary HTML prettification Co-authored-by: TheLovinator1 <4153203+TheLovinator1@users.noreply.github.com> --- __pycache__/scrape.cpython-312.pyc | Bin 0 -> 31836 bytes scrape.py | 116 ++++++++++------------------- 2 files changed, 40 insertions(+), 76 deletions(-) create mode 100644 __pycache__/scrape.cpython-312.pyc diff --git a/__pycache__/scrape.cpython-312.pyc b/__pycache__/scrape.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..526121026d6d0a75a786d48f2e35fb29f9a08cc9 GIT binary patch literal 31836 zcmcJ&d3apcl^^(C?Hf=9ps;U+jX;4|2!bmJkOD!F1Q&1tXh~cI7x4;Yf!O@2APS{I zn{9j2V9XhUai7a18g4$rOqmO$wU$Ndcjl*cZ69_=u2 z+->d*CvXBk!kv}g{8^rbb!T<#tv{=0Z^KywdmGOh+1qs1#NOt!X1sMHmQm|j>!{_d zg(p1v5!lrC)H%ytOwng2BFxe`MSI80ELavY}Btp-?0g3nfD7 z>$bD?LYaTFQ0~tfenDv~?Jaoxjs8r3=8&!skpdq!bKHS7dEf? z-avWct4U}?`W-?O%WpHFn-ShAY*E842=4;+-KptWowmv_CZQEDZGwW+AZ$Z;&xSc} z$Jch$&ME95JVQERCt^DMdxc$B>Nrj;K(BDaf1tIc;zNALdvZYOAD)}yr+9DMZz;al zt4r$7^mipKf-mF`1xEc|K501V3tjf=lNtRlp6njzI^5lLw5Rt_Qr|f?l{6mpYuC6qJ{by(`NQ^_x>0{H=)2_i z1_5gv@xSaJNt%Yn#Zg}Mn;u6Ty9L&CQ4mdJ*fXJioQ_yf^GdxPA?Au)caS1J(TN+c)v4uBm>X_x#vo zE@BAuudn?Fiap=Q;sG1BoIih_z`SAex%031ws79mE>EJ z{MIDjn&h`7`Rz%5N0Q%}YwOu>;H+-$W7}ax?w8%xUkgl2&3wbW&P~wN3icfnBI@Cg430}mgzRJC#f04V&|4aQ< z-mCXv9peuALxYz+NBU0mdN9C!o*{93)Z_Du~ft?70Z^Ygfsu<^!4d)e?4j` zTQzdkyMOoC#ARf!ILcySv<#kFt@ z1{42J7>=EwS3&-&j^|zksUmo3@`7}zB;iYu0c(U;Nvo7d zA||M+MoKKcz&KR~)MI?Y59)DMBvzpUn36_ea&#h?w00|0f>8~z2H!w0ym*QA1ji34 zghH&Pu$vllfb~ycDBu^Q$xjUbe*6W$2S=X#X3W`~;@cg!I_3)&T9@*db2^`Jy!8mb z%G|rB64^PoYj4%QUj1hMYxVQ}ixqckmWvL@vwKz?JyA!`6FmXTNgSNZy_!K0pCueU zpRKZQpGhB6v91!}?;l$^M=nu~+!U&@dEtd+%Z`tnZlW5w{}B8(iuw(He+~bw5)^%& z@2oMsrHAkB&7GU|_ndrZt?8c2Kz_F!(f7*vPOssf$3XsSN_Ve@Cx0zZFka^0Z0hVV z-)rGJw>j=@HIsigp}g15;5*3wK3BfKQvZIr4jA6A;K^U9hd-G?ydJ3i=mcoKxP~0j z%1vbD(O%3P;vt?*ij)bMjx*3Wfjc^doG>oKiagdm3Kuc0580JZ!zve`#DM&95q(5I zq*L+*uaK#H9nMmG+FQ^gg>E=Ui&1<^jtap8Au@mjxs~rS3HoB}#&w*CN`ky*pPk5$ zKg8yFZO}7{2{sfM^wCo3p+4|j^wDyTP*2|p7Q=eQgI?K)5`8I}qLH9AJ)O$ld_jhv zV@N(vZB(?Lt3+~8MU4yDnBY9ks(G=oX0)+J@YEb`t2y3Q(+A393WWn>L*wD1eq~fL zMyW=^3rblN><^d8LExY-Q4drUNT7>Q9*`Iw2%tT5&pNbZ@EL*ldwsBVHP5X6w)V*b~ zZfXCIj{V@+f9PGd98EZLK6`BE9C->sl7O}?US78BQ^?WNU^dG1jn3@OHtt?q`Ti39 z`^7xmL5x-PA>M(JfluMNjVTw*3#lK|Z4m@snckR0I*<*O`xsU?11bXG6ZFdT9Zo|E zG%nzdLBTuCV>?6pxpj{+iTgNAU3IF;Q&)veTjupsSEq0fUQ_tT^2Rpp%%t`E%vj(R zZ5*cM8r1PXWcD9B`G!WueIa>EvL?$04~G60TK@;x9y~}?WypttsIU#;yzbEnFgI#B zWWw@alx7h-;1fwg7A}=0n1pKG9IFX>Ez+XhP4G1Q#plTBB!|Wcn-}6C_`EtcS;WKe z2X%1NX@cFx`e||u-zN!BgmaDi#LYPhv5rc^wqe=Qm~dvV8mz{gFFSWEoGbfgE`aa%R6`I?``Mdu9*nLFtds9 zQ@FRWUlw$NUNC5@qPCm9WSr(P@Clkd!_-de9xi@r|41zIYLJ8dO$)gNhkk*&!#!Bv*_BVo9((GhUIxb5lpc>9}QW#4rPHexd z3IY8R*r5$bDRDaNc}g#_t(`U0pp^GZl`VgxvUv-_zv>zv9Swv$@6)I)+A&Ted!Hn<%U zZ_`$DIBaQb92y@P8Nb>O0j(kAedHk;gf@p}q7;Biz2#kwU zP2mc8k+|$1yh3akwVM1<=GhL03n|k7ihpo2g|$0x^xrf_c0*vr0=fFSLA zNfgqCOu+)`LgBE=8}=)_VN&lC1QHbwQhwq|bU#AQLp}WiooD(FpEx~mptHX_nN1O0 zCypQQ>F@8B<4jV6g>{XiVMC`Nc(Bv#NBX4RKlU=i6!!F~4LLF;cSs1#I9fp(58-Wn zlNTq%aT2b1{7is0J~%ijiU7KLnHo&WL|QE6-F#EHNLff}gt2cbMsI#n4^ALy3#y2d z4pzAV2+1cVLrEiX{uA0-o6PR=jbK4PDHVgEW;^GkDR>z?>BlSWp+q&W~| zYi!a$!aU-J2S#bMO3ZNvNkCO*Kw^xEVIHEmq#klPv5OkXj7AEG5CR&;d`L^0IImgS zq}wsH1~9S)21kWtrV@}#&%#>QGo~PEAs|);()$Ey$s^twkHkI~Nv$Zc1~dgz*k}^) z5eFT-3+5DTbOC40Ojt4!mXd@e`+>zibNJgCt9DbF>DtlNOf&Z2WpyiM+hb+h7q8qm zESDW$DeH@s^~KB1%w&FKFMQz0zIpWe(P+VQck7lNM-w^u5CuAlRx`a|&TBTmbN;RK zi#73@T>y5L67aF>$D)PXmL1y@IR&dG&Q-ADERQ+Mmz@>rS1{nqjy=-%lIk}*UhDYo zzLnzESaIu&^+ADWrJyNR(6li1!@lK$&Xs~Av4SHBNBJYYuFy7PAwE9mX6E(Gxy%LQ zo#5iG_ok!y9e4YqC;LCNoLMy)%54uaT{nlX53lOse^_3dsI33g4DXCyO`FIlo_k?F zv=CZ6{O^wctK*CNqK#cq$ALt4Uc!}=$jbdYtI=wET%_(8b}U(zEuCzqaE`}TVZ~k- zv)3sb21V}KzJx8qA!W<(`^QH%20gKCISIfAcK3QD+ZSJ1wmhdGc^V`=??3D4-o)L@ zZaA3B-P^Yb?)#hed-3{@o4g0J^*_zx$<3`h2U zj@Ye7{|PSbQm#E~m!b+Gz|ip89tFD*dH2J18DGGbt=R{q*||U}Dr2G?uwzNL6iZT8 zdHLNgsa?KcXEUuAEMR`rB{HIx_5gPQqW1H^G;N$VMNH|$9|}t)SV>(NmC6Nmck+` zJvMFK02V531$Npd;fc5Z<4|$aNPir&9pYHBFaJ;)|RN^I?dc)XkT=6c$A%ZWbF+e32APzx{7`!uF~vXbL6rBhsY(EzR(!ye;fV z`C?l`nTRPTf#W0E6A)!er--G^q|zRJ+`IbTq-_xFSIFN_%Q5HWMd}=|NBk>6%2HY| zpIX{Se4`fy-`;Q?DmkE3dLT4DK+GFh)Ao_^K}h`f!XrWc9;w-8_Zm0f`A+v+-3tv% zx_I@TYtR?t>4&gif5i_4zX+u~G>l2BjI~D)Uj{(ZA&2_|p%MR63)))p?PmITsN>h_ z@tUqoisP5SI|Q4sx;OjBnn!(svF0)V)nK#W4?#xJ9M(5_&OOy#s0+Kjo^$6&v3KD@ zol=?Ru(^q%ytZ(GEv1z`2LhshkeE@MExfHC13^gvkE}Bxu9b$!SCvm+^$1F;CR@1r z#qmkcpl{6ca=?F;07pINNCTr~?gdnFr!tn%#$fnThmtp25*(VZ`ouA?!6M1JL?1a{ zC1*RFq?P(&1cndLeE7#E!A7BDMbKHuDE(tXkT{*B$u|LQi2!D993~Wk_#!2zP6z+y z_g_g`Lg2|DFZL&GOrgT?i6n&aI>a%8vrAphxF>Np#Tcbb^OV{T1kqp+4GoYs=anQM zNE!$tX+>`gNU&xZHffVR0|6nKq4-18=SjN~gql1^QhCP0$&SRy<)t5v;-3Ke7x)XV zViaHF9-5s|b3Vyr!1_e8FqPbg^ zEiHh0Xmzhx3uD&8x$1@dsI@R|ZCT}-jjfBP9u}9px#zV#-|d(&f>o=mhNAaDzGpr+ zo?rVZr?=)^KQz-h69V6JGkiV#?a178+}V(5YL2=#edNkrH3I?)9Z%(qHDRxqKlOoq zQ=*_W;cAFx*DbWfvKv;xvSnWHS+#J5Wive=ISb|s7q-4tx>B(-RCQui`uB4W?7{12g~mgb=A?BL(_(;W#%XjvPXr;X3(5i6BI z;S3>e(s)9Om-%n8Dx~s<8fSyn+Ma?vEguDF83*ci>>1GD7E_=H4cwH&n-TtNd1DX3 z%ZMhXXM&fDKC)gS_0~lwz5JqQ!Y?u*&e&jj;gdR_Jaxo1onzJt*xgHJL>FXp1(}R{ z2wE~4fXE*!GA(2&CD1g8+T2vl7}P}=5)dt|wiq+6Dkw|0L^4feT!c6YUwBg&gilzK zWKj!P9j5aPO6FN=v7Q9p=DhXKS+$ADya!qXaEwvlnkg(YJ$C9r?y_FNNn^Y zWd8u`hf!EDN^&%j^c>^;`21Ap+5iKT27B|{rE-mE+ zZ1lnSf=RzpfW_|tItNoa=!XN5Og2|jd+j(DzfU=XaoddHL2kjE`^|#a3g$O2i1EU$ z@!ZxKTOzOM_T^ib=OXdEh8cU(p8Kbp6LqaK$LH!laMY|?k?^sDGur>?=pP=9x_14r zYH2jyc5K;j9E(qL%e51Usx2#3J7QHk;#Ipo@)iO+p|e{=>n$&GEgb zmkoU=Yt`0IIg_!6k6FrQth0MmwbPbG@6x$tOV0y)>0Dr;c-g-FK~CdB;KQ6<4_tY- z9k(3O(!F=BAG(fwtO%XjmjWL;4u6tC*hJcoCv~I3=XMxO_A!C4q?k(CiOEz6AJj*q zPDBI+ofd`>v;JF#XulpUMC6gy4D z5ZObL>{Nop%C?b+9duFBsY*5(ARue;FkL_}Fgh{9Y>%Wu%gB;On2HTv7HLsf(6C?}YK-vEfXh(RIw*Z~;0a6-9{Gyt5?rAdG|4*3sY`0hyfN5h;zv#0-FtHUGkJaCw>DdpYmS77%#z2dm^b5AqR$Z z8shSI@r_w?Ce1=12vdrwHRf639m=&*QY(OSV?9s%i7P>E|4I3sAY!o-z}L7$VcDC- zuNBXq`fk~b{y~1>?U!%89No0%Zsq;H<@{4K=0sNZ?9N&5+}63lxij+>Q75?gYBv16 zM)6+(Mbdbjwfu|ZcOU5K>pF4zz`(Jd-lGF2J4yQAEB{asz7AoWR#;Bo@ z6_Du^J~tTH1bv93+q2K%`?~RU;0u3lJ(ht0B^Gq+?`LzM?tTKWh05V1v_$J7LoehWCwBhH+|xnyBph zhBZ;eup8Dy#gVoqYWgp1=`UZC|9gX)Xf3||nJxa0X;W-x*0$*^f1?_je;wX+>H9$w zFY_32wDA#4>1^<|O%BMAIsDdP-qQ@x`F{BYb?32vZXr5GFTQP;xtbEBu|v zN`6x;zbT%-5py=Acge&(EmuK|M7!vM++edC4`OdK!1B4Y0*Uf#`_swlzhEPLz zXY{Sn#qQ-zZHdCtmBRX1Vg17Pcwx&*;oex`-lfTSVfRYm(OBWpc;WGx?gUtoRyENDEp`#Y)GP7?ETpyTwemSFNq5Auc?=~)(m+PO?(Cv&DwyqRD7b|@3ZhpM5 zXQl8&tnfs<@YENhJF}corQwi@v@=!+WV=6?@E4BSzsbl_3RnFLNBL?a>b=^;p}c#p z@0oo$ZZC@($|R;q;)O|xxkj!)hB1sZ&%nIlHto!HnlvGu#l6NYDQkB+@~ILy7@UeP zKsUh?ua%aJVr5r$gGA27&XoL#A7A%`RC1|JPR;&4-Q?6td`Fc9JgdxDVDuM8u)r8n z@-w6@zey{<(jt&p7)2vYl8i)f#NMQp(m!6midv^`XyENp-Fzc?h&#hTAMgR9_4d7?GJcP z)p%;!5vu$`*qk-6H3}?Z<;4mZNCm>4w7I$0V1S(J3t)5Cz^bNE!;MNR?X9#fLDO9JlM}bm8V5)gTKgOe;g`p0my@p8uyzYr)37anQcWPCvMPR1( ztph?(sEe3@)ti!=N^cNfAIXNhIZ_0-A>xMH7|Dg(6e)z;{D#_pkY08w*o7@hd8dmb z#lu}ny!IAK@3bf|(Bc+T9}8P$NpeY~WVl;{ReZ{f5p}cr8%Drj-mHIwR$<$n?Mj_t zxlkG@9VQvpx+jvN$rg6dKP3(7q0zOY>yjZT?1ap{MA`8I`nUoUDT(ALSXddKPr(G8 z|N2MR1w1mX1|zG#F%Cv{H`_VGt^w__CuIc7aH%*_R?v2(Er*GjTlWA{hoB$sTNjO> z#vlV`?;FI-{I-`Z0!O!Lkp$dW9XW+$^;XY%E+0|%np%%J(uv&tw_-l=p-URi5)L;f(WR z=f|MsKHox~u!G$C^DQ`Zj*x*)t0PW1!cIrk4eX1hNlt-NDl~hUyhHq#R3qcii0@J` zY^VoB(ggMMMZZYqp5}3OPB$q2A)vzc_KC@h7zvmC!rri>y;%z-opSm6rzZTp==HGk zdF6n<=XvySFzjk4#q=ep;;{u2M>{IR`pV|L;j&TrVt^*>%W!ZIn&)6s&^P9XYzQW$ z5Y5mWO4_uq;TD@FewKXgXee=N@8{y_YkL5Nt!Qt?EB$x2vDzE2#&?x>`_W%K3G=m=_-9*xqg5>8jWqD;K|+#1u5_GSrsV{DURVDFbPB0(zvGA*9==8YB_S)JRjR)vhu%nV1gA8h} zUTwPC+B7a+YTmkK%l2kk3o51hw@c%CUB9U|N^8b_d&)TfGR?f!R_9C4i*(tDzN<~J z{k=SS5qBSm@+4sPK>e5f$HsAb*at(NmUX5>&6twGW+Jyi7!3t6A)CV&LP41$eT06p=^8KG!HRyp&j8<*_ni9!sM<$uzq8+~Y966*2VW;pw*nGjuB8lviW{eKHlZQz$CC&6VDZZ%pMDM_U*gn#Y zJ_CIxJG;8qs7;au-N#S%zlgIG2f7FLA3M>dYE_c?eW%WJo<{8H&aR`~{e5b>9}!~5 z{?22^l&ECh>F$Hwr@MQB{dnhT4TrO<=XBSx?gIn8XO8dhKHVq2PK776j%X(=*5hZ6 z_4h!T)17p8_jPri1ZZD>=V=wOWmJ&#HKG+cu;EQv$-f=7~aXDclKfSOo*jKp6D z8411)NeN`#d0ctLwPQF!@uuT7$J>3d3|uK_jTN-U3$|Z7_KAtJxDxs0^Zcz>Z-4#P z*XJXPr{np%W*i^c^FPKh4Od0XRq=M!{7AgAHM;$9+|~2ztb$wlZ|i6C<5|_e&MLZ9 z_?_ZaGnY{aqd#}aBN!takMJ|*2SufGm)^Yc+LeXPi(fJ`q9xJ9KD|zec^1p{ao}V|H?~&*h_)v@XOIx!ts}0jkUiT-x-PS zn2u)s0dlftA*ZsMH_yFxZlP?cC|-JCCL>Y4>CIPPdo{Xw&ky_Vmc~0y#LG|4I6rch zCCWC>ocOqQSE8sKCrjow%{D(a>g#Mc4sPLU>K3dkb-QDAyO)~dbw^g}PR8m^#_LYU ztNWssir+Zv5{3Kkw%u)r=AW49qGKw(*L&w$mmTGay1h|H4J;jU8dtKn#Im<6=0ta# zh-aUiF;nE`mF%WicGH5mxHF!;M~bUo$!?5gH!fUSyc*BmH)H-dyKD}=TIp-~N_J%| zyE0nUy7+uNyF-ffu4FgBTykMx=~6uVAd4$+S}ALdm9@spaJ`Pnnt8o<=HTqX2YCgz zhi(leip$^Dy<>gLy09tkX^EC^U2KRJ>`TBH^ec0R=MP778x#3u1UqtT%7 zJfJGwv{KR>D`}3Gw9Fh%6xGaMo-V6*Bg;W?=;6mae#KV(ywmmgk{s3u;+Fwam3a z;giW(b5|_IF-!5PPM1~qu&`pzhC5KIN*?-Dh*pLC#XtjJH9@$=8i zvQO?ZMt2!c@zyn3f6}Tm)e>wSCDjrPXVU4AHC;JG)L((ph3;NUBZd-mvhijPzSHuy z?xEA@Q1QqqA(GSM47jS>PkYUis`-XFY}x}#5mo!m<3zb&rhiHfFm-$Z2UvKV_s&#` zlJ;28nzO-E<`gqhUa;cOj_r&C+0@ve8;3T?wItf!c;f z2OhNJVJHGBq!5IK78vJNDk1HCM-_TbI{=Yx1i06a09X>#Xc!cqrj&uoXkDK^qvW~i z+7d_uLy}rJ^VOQ~c?X3J-f$*&EhE=%^ZS9vE%NxXhO%@>+;Mx+# z`Dvn5SjuA0MFLu73o8p`yXYh{ zjm59MMlp@OFgA4B5u7zL;?sS7vVfb+dF7HlHPtiQ-=w`$+I1g?t#XnVHSx^TV(E~z zKvxlx;WE2~jcv_gm>b^Gg-nSd?5KRw~+Py+;?*sbK-q2p4WMeW2O%M;;Vr8t(%Z4yMW+;w0&uUXe3;AfVMJp zl#a&22=y;SSxQH?t+&!Hw90MI1lTpO(WLZrTpyJhLoVW7a!6_xZj#1as{CFVbDzrx z2*S#4BRBx7cUHJf=`oN|yjnuaIwxtkG%|jXPPfAtn(gg~fZ{~cxKEIFfYnqi7db=% z#jlbxLk`gvX%qNe^0A#W>loU3h?mKseIl&7=g5B>PST+5bCeTs(uoyi`xP9JPHrW$ zeWMsop@H!s^*|of3bdD``6ZcSJLTP^QYCFeDgHl{s@8k0=b@|Uw)2*A-V%3tKjo~(LKqO)oi}%1-#wS{ zfxQOS(us_$n=f8}aW**r{LG8-jK)>4@P&(6t2$#w;e+PZ?@zruwUqTZz(=WvxenbgTDJ7lp|6ImIBxWjqch=hKQMt7OIMiCI1KISW^#R!`izlk%!t!U;H*7Y?m0 zTWTq@I<+-+Mca;jXgL1a)=z`aVXXao{&0@*hno%+8-B7U`!J5^*JmGY z(EqI1d)Q^ZZ=jF&ojmzndh+KO50_f+dyI#@=KBrC!`sbkjIdyOC2`8%!+<3FSny~6 zKzwit!A*M`aXNy_ZZqObA`50hpZG_3Lo(sy!k_(!x(ZL`s>b&4LXr1wua! zGo^^`WjfMNKGVxQd>J^f#*XSoOc4v&E!@bXb)Kq_#&= zk7kfGf?JT{px8ng*Pmer%h>5NW^YVqS0&4g&$WI=3V-3FGBA$ho8(iu7V*DQkntx41Oq?g!?+}>Eh^V}Yh{?w#UE?F_WdN+J#0bEHMAlRuLE;YF zbWWQJPp8J@o(qaDAQ=S#2#&x52BCz@D*BWwH}>;5)CIYy@aE;$F3-QZxIbRDGhVc7 z#`cl35c~m_tFpR)n`0)LPWykYUg3O|o zjIvlp*<5hp`DjL2JY)MRXS5bBfvYgA;VN!9o*3}~JV8m_N^wK1xFKHL1Q{75J?_Ii zmN#qDOc(A5oE84*t3cN9LHUkEb=^YFTRpRmxxNovki!84H)`S>b|h}!oX9F&$*PQH zRnE6A=0~$C<5|10yk+$8OQ#;V+_$Z_taA+uJY5g`q3bZoqk4E%Aa&sG$cK);PxeuZ ze1;=n8!xhsWfu;_6+E${=$}x4_v&&F=nX%% z6m^@8Khb*+>@xqPf#QF%lPCW!J^A+-yA9U=Xf}4ctl-7JMgxQB_8|V$hZa{FZI6Z^ z3X*;fKwDN*P94bvUrh&(ve4j1IIwi83l+|c(cL(-xJEeG-|Fc|3PJ|lgrZNmNk?1S zXz^7AQCbdcp}`QN2gIC8%AT}KD;79yVN6+)(k!Zi^^W?(FYHEiJG&8GC;zm8F~u5L zr=9-APasHe%1L#0}~$R7wm)I?H51{mBns>xTU*MZGb=xjEFTjl+tFZAWrUDyl~r2x;{?_soIOKsaqOIO^}z)nitn2y6@yF z{i5X#hU-i8*l>kbcEu;CqAj$eO>RdB6UlOW811?K5%P)Y5*_KkqV>wrdUg^m0-r<1 z2YL-bpCe*t?3^Xy0Gu6g8Hml8<;6`Y?VN6G6ohCoVUK37th55Vg#dy!Fp%9WA27=i z2WD|0o70AAFl8)UBs0*iXXFM;W!M%;8!kckP`r^PJku(|Y8_AqYaf3VgaN zFC&?-ayA9{|BYWiSEjQXQhv-}%mC?$sP{P_cDEXv zqH0L7TVIRgi`R&X)cmF@cXAM*{*JkS%iJF`_s$@GZDLS2ZMljcslg`T@8q!UDq`6e zhx}zTezW3T{Ef)JTVa0{TO8>pa@PND#f^xxFnR{p)Eb1e;?Ef?B6Ep*v4?1I@{j~?YRN__j=Pdfzu7IQLuT(S7N ze6?}ZF9dvzgTh$jD0t(+%f4~-k|t)8<+~D)3)3vWw>E8QraPFL54CLH3cIkdyLmmI zyD!ucwl>SvY-V!7e(~q%+i)ps3KGrAU!@BtWD)Hpsr&anM<%IcIfqR`Z@7*Lb=y2F zDQxf7=cXFiGNvc&TK_#Nz>$$7TH&Qed}LIvDC1}FE>(Jc~phAnLK zi7pV?zNqYg{2{eo5b?@Qp3IVdHEKZpv7TVqBTLcKCL~#J*sCC=#+1#Sfm5|~hH4=o z-RTZCG+5*AA7|w6vnih6*Dt2`EJtwxkVmDziNi4fY z!|TTFuaXuV;haAt?Sg zipgXfWDE~>TUb!qPCDrtFu7}C&=gE&DqS}a@&&I1CBq`Syo)FFbW2=tawHTKvBu(5 zs~jTLgSyd7@ko1416=?o)`ACo1AC6(Uw}NZA8&yjhdcj|L${}HO?~IpdHf!NH=a{} z?bt)h^DuWXW?*;eshb;)*0nF4U+RsP9EC1EQC>SYykL!%Y(?NfR_^R@Jgef`k%S?0 z#o&$^+;alrmkm{_XtX)n(z$Hd|FP4(;v{Q?1^2SE=?Qm|H$tvT?&7A8TOeXSc7G_| za%N@onHeiC_)@QU?OMilsyXf_b^=XJw(8=FYofK1t;2(?n)$$DVLWTkihWPizK4=C zLk8UJvFteV7}sdYzuC~dY}xX#dDq|Tb(OY6?dGKeKRWz_!*|bp*na9p*Q{ao988x! zbX3BwqtZ5GhcQB>?Gs#UUo7vW6Rz`+6lY`tVMY!a_e6{LE<5(gUtiz$X4`9R^MNIP zxwws8OzVEq^0{)iafMZHI$v`xSmFgOe{VF@*dDsvbT!)4^{KhzA2_{azLIsb=i5EF zc(%qilL;e=8XKxxza6FCWJ?s+EZOcieX#3=e|iMv++>3x4TsmKIB?9$$tq4(P37^9 zQ_GIi=*E-$X?~_RQP%h=XU;gp&)Od5*G8evDq3(X9%h~EK8h~SDgkUw^Wu@k_NB>a zOLwfgd)EFjZb`Avb${qOxauUtkKNo6 z{uKY8%l^D9kyVb;9pN)!k7ZUE0 z-{hB4eDAH^L;>Q_Ke=T5lUs^Ta>G>SZQ;Y5&C5HEB-}-FM`0_pGaChI$^L{KfaDh6 zK6>jYB|V30+H&(gsX>_@H&FkwhTOjF*zv@t)4K7opyP~%bA|IqmJQ7h3`KLhmkl)! z444VahKdJTI?pXaA!}7a_o6r1i)kp|`hYNfh?Z5xjhGDi^ zN-@9xbO$>7cb{gWLsvIZd;M;;4h8*ewT~snPa$a~`GooZA()Mc{f*A{-WLAeUjFz_ z)4hFq_}|yLdhzoj+5GV~)B8Ckenf!|?tdBm6(++=&0NoA2dJKhAB1|0kBb zUcKoj`>jGEMOR%x&#;nSNGgAh&YA z1yO%q&-dn<{(N&Mq5X?;C%H|0Z-M16nhoUN)&%!2D+%{sHuAkihQDm;v?1_UZ9G2w zRR<6M&pGn{TyG@T!eZ=p@)xl9A{J9(>TNRrypr#&as0f>O#XT^(k3kf14Q=*2E2T) z_c<{it;E)pq#0(_wET-*pqi2m>`UZ3OAdZFSN@R=P)RH^beGde;38Ym@M}3!B-|89 z2riO%MI`Q6+(r&9Z{h=TehDXOSByvlLsODvu#I9SCxX5q2z?2T_6bSnpmZY_JF4Tm zI4C+;YAiH?!LISKp}?i2p{MuYiKG=j;c<-pE+<=4lO`#UbPf#o#&C%$`(dNtzyKMw zj!uw?=>I{Pd_>NN^f~#K~DECqWKdck`(7_+?;@EzL;-{q~T^*5)3-(7`7B z9Ezk?AEiWpLeBTd`M2c!2o7Cv3BK$xg-Puxt-ZANP=}BS_`rZTgs<;Px^uQd;aXK( z1vEM0-z&BO692JEVX4CJfARbSF7ub1{g>og+5e1Ra#@J^HCOp7uHe^PQ;ci+8_xMl z&hblfoxkF|zvgOwqj&wv;C;yDTr;n5IWaCLVavMNaJ?a6FZes7o~K{>DcU@jFI~j| zE6{jh`!&;ynJnt!TssIzLFKGS_dP@_kHzzkM>#hxyqc?|-!O~Tv@c!!(Z~-*;zdWJ zTz^68RMpV!EU2`;m7e z@%(neURpbM`pvVion5fSOIxE{5$OEfR$5S^oA$+vpNn!Ncy&0T3vOKqF104?nX-4= zCq{ja?wWJek;S{`s#ZC?<|kKbcg1RVvA|OQ-Kx96yLCT!>An!_IQ@tKR_n3>)xS`+ z5L~Ex3x;|KE``|ZzTpvtR_lua;a=tNS{!7rCE*c;R@?LWoOwjywcvk5uhne_c;3AL zXnHSJKVt7sdUb7ltCq{65UXihrx@DaUMkMRK=pF8iK7ZzF<^$WY+8jDpx zx=oRH(Ju79U-gK+S6?vh<9qq}>Q#>33)>#C_v!&%Bj3;CexaO&!G(gidRI9L-!(|@ z`|d|9y!uuC3eRWE9-2FF>-Zy%{Hy2noWU^@{y<-tFgULrA~pFf!*@&{8uGES8j6^q G$o~fyRaGef literal 0 HcmV?d00001 diff --git a/scrape.py b/scrape.py index 08b7e24..f8e1be3 100644 --- a/scrape.py +++ b/scrape.py @@ -14,7 +14,6 @@ import aiofiles import httpx import markdown import mdformat -from bs4 import BeautifulSoup from markdownify import MarkdownConverter # pyright: ignore[reportMissingTypeStubs] from markupsafe import Markup, escape @@ -28,6 +27,21 @@ logging.basicConfig( logger: logging.Logger = logging.getLogger("wutheringwaves") +# Compile regex patterns for better performance +DISCORD_LINK_PATTERN = re.compile(r'\[([^\]]+)\]\((https?://[^\s)]+) "\2"\)') +SQUARE_BRACKETS_PATTERN = re.compile(r"^\s*\[([^\]]+)\]\s*$", re.MULTILINE) +BALL_PATTERN = re.compile(r"●\s*(.*?)\n", re.MULTILINE) +REFERENCE_MARK_PATTERN = re.compile(r"^\s*※\s*(\S.*?)\s*$", re.MULTILINE) +ESCAPED_STAR_PATTERN = re.compile(r"\\\*(.*)", re.MULTILINE) +NON_BREAKING_SPACE_PATTERN = re.compile(r"[\xa0 ]") # noqa: RUF001 +EMPTY_CODE_BLOCK_PATTERN = re.compile(r"```[ \t]*\n[ \t]*\n```") + +# Circled number patterns +CIRCLED_NUMBERS = { + "①": "1", "②": "2", "③": "3", "④": "4", "⑤": "5", + "⑥": "6", "⑦": "7", "⑧": "8", "⑨": "9", "⑩": "10", +} + async def fetch_json(url: str, client: httpx.AsyncClient) -> dict[Any, Any] | None: """Fetch JSON data from a URL. @@ -325,13 +339,7 @@ def format_discord_links(md: str) -> str: # Before: [Link](https://example.com "Link") # After: [Link](https://example.com) - formatted_links_md: str = re.sub( - pattern=r'\[([^\]]+)\]\((https?://[^\s)]+) "\2"\)', - repl=repl, - string=md, - ) - - return formatted_links_md + return DISCORD_LINK_PATTERN.sub(repl, md) def handle_stars(text: str) -> str: @@ -422,80 +430,36 @@ def generate_atom_feed(articles: list[dict[Any, Any]], file_name: str) -> str: logger.warning(msg) article_content_converted = "No content available" - # Remove non-breaking spaces - xa0_removed: str = re.sub( - r"\xa0", " ", article_content_converted - ) # Replace non-breaking spaces with regular spaces - - # Replace non-breaking spaces with regular spaces - non_breaking_space_removed: str = xa0_removed.replace( - " ", # noqa: RUF001 - " ", - ) - - # Remove code blocks that has only spaces and newlines inside them - empty_code_block_removed: str = re.sub( - pattern=r"```[ \t]*\n[ \t]*\n```", - repl="", - string=non_breaking_space_removed, # type: ignore # noqa: PGH003 - ) - + # Combine non-breaking space replacements in one pass + content = NON_BREAKING_SPACE_PATTERN.sub(" ", article_content_converted) + + # Remove empty code blocks + content = EMPTY_CODE_BLOCK_PATTERN.sub("", content) + # [How to Update] should be # How to Update - square_brackets_converted: str = re.sub( - pattern=r"^\s*\[([^\]]+)\]\s*$", - repl=r"# \1", - string=empty_code_block_removed, # type: ignore # noqa: PGH003 - flags=re.MULTILINE, - ) - - stars_converted: str = handle_stars(square_brackets_converted) - - # If `● Word` is in the content, replace it `## Word` instead with regex - ball_converted: str = re.sub( - pattern=r"●\s*(.*?)\n", - repl=r"\n\n## \1\n\n", - string=stars_converted, - flags=re.MULTILINE, - ) - - # If `※ Word` is in the content, replace it `* word * ` instead with regex - reference_mark_converted: str = re.sub( - pattern=r"^\s*※\s*(\S.*?)\s*$", - repl=r"\n\n*\1*\n\n", - string=ball_converted, - flags=re.MULTILINE, - ) - - # Replace circled Unicode numbers (①-⑳) with plain numbered text (e.g., "1. ", "2. ", ..., "20. ") - number_symbol: dict[str, str] = { - "①": "1", - "②": "2", - "③": "3", - "④": "4", - "⑤": "5", - "⑥": "6", - "⑦": "7", - "⑧": "8", - "⑨": "9", - "⑩": "10", - } - for symbol, number in number_symbol.items(): - reference_mark_converted = re.sub( + content = SQUARE_BRACKETS_PATTERN.sub(r"# \1", content) + + content = handle_stars(content) + + # If `● Word` is in the content, replace it `## Word` instead + content = BALL_PATTERN.sub(r"\n\n## \1\n\n", content) + + # If `※ Word` is in the content, replace it `* word * ` instead + content = REFERENCE_MARK_PATTERN.sub(r"\n\n*\1*\n\n", content) + + # Replace circled Unicode numbers with plain numbered text + for symbol, number in CIRCLED_NUMBERS.items(): + content = re.sub( pattern=rf"^\s*{re.escape(symbol)}\s*(.*?)\s*$", repl=rf"\n\n{number}. \1\n\n", - string=reference_mark_converted, + string=content, flags=re.MULTILINE, ) - - space_before_star_added: str = re.sub( - pattern=r"\\\*(.*)", - repl=r"* \1", - string=reference_mark_converted, - flags=re.MULTILINE, - ) + + content = ESCAPED_STAR_PATTERN.sub(r"* \1", content) markdown_formatted: str = mdformat.text( # type: ignore # noqa: PGH003 - space_before_star_added, + content, options={ "number": True, # Allow 1., 2., 3. numbering }, @@ -556,7 +520,7 @@ def generate_atom_feed(articles: list[dict[Any, Any]], file_name: str) -> str: html_file: Path = html_dir / f"{article_id}.html" if not html_file.is_file(): with html_file.open("w", encoding="utf-8") as f: - f.write(str(BeautifulSoup(html, "html.parser").prettify())) + f.write(html) logger.info("Saved HTML for article %s to %s", article_id, html_file) # Set the file timestamp