From 0d44632d019fb60b75a1d5c3c2f69d0fe33a7d79 Mon Sep 17 00:00:00 2001 From: Tobias Gruetzmacher Date: Sat, 10 Jun 2023 20:03:56 +0200 Subject: [PATCH] Fix complex image extraction in Unsounded This also adds a test to ensure this extraction continues working in the future. --- dosagelib/plugins/u.py | 29 +++++++++++++++++------ tests/responses/unsounded-17-137.html.gz | Bin 0 -> 717 bytes tests/responses/unsounded-17-92.html.gz | Bin 0 -> 669 bytes tests/responses/unsounded-root.html.gz | Bin 0 -> 8766 bytes tests/test_modules.py | 22 ++++++++++++++--- 5 files changed, 41 insertions(+), 10 deletions(-) create mode 100644 tests/responses/unsounded-17-137.html.gz create mode 100644 tests/responses/unsounded-17-92.html.gz create mode 100644 tests/responses/unsounded-root.html.gz diff --git a/dosagelib/plugins/u.py b/dosagelib/plugins/u.py index 99e31d682..c26c96ff4 100644 --- a/dosagelib/plugins/u.py +++ b/dosagelib/plugins/u.py @@ -4,9 +4,9 @@ # SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher # SPDX-FileCopyrightText: © 2019 Daniel Ring import json +import re +from contextlib import suppress from re import compile -from urllib.parse import urljoin -from lxml import etree from ..scraper import BasicScraper, ParserScraper from ..helpers import indirectStarter @@ -89,15 +89,30 @@ class Unsounded(ParserScraper): latestSearch = '//div[@id="chapter_box"][1]//a[last()]' multipleImagesPerStrip = True starter = indirectStarter + style_bg_regex = re.compile(r'background-image: url\((.*pageart/.*)\)') help = 'Index format: chapter-page' def extract_image_urls(self, url, data): - imageUrls = super().extract_image_urls(url, data) + urls = [] + with suppress(ValueError): + urls.extend(super().extract_image_urls(url, data)) # Include background for multi-image pages - imageRegex = compile(r'background-image: url\((pageart/.*)\)') - for match in imageRegex.finditer(str(etree.tostring(data))): - imageUrls.append(normaliseURL(urljoin(data[1], match.group(1)))) - return imageUrls + cssbg = self.extract_css_bg(data) + if cssbg: + urls.append(cssbg) + if not urls: + raise ValueError(f'No comic found at {url!r}') + return urls + + def extract_css_bg(self, page) -> str | None: + comicdivs = page.xpath('//div[@id="comic"]') + if comicdivs: + style = comicdivs[0].attrib.get('style') + if style: + hit = self.style_bg_regex.search(style) + if hit: + return hit.group(1) + return None def namer(self, image_url, page_url): filename = image_url.rsplit('/', 1)[-1] diff --git a/tests/responses/unsounded-17-137.html.gz b/tests/responses/unsounded-17-137.html.gz new file mode 100644 index 0000000000000000000000000000000000000000..b69a444512f0a188bb8b50e527c9a02756ee5a81 GIT binary patch literal 717 zcmV;;0y6y{iwFoUVS{7>19fh5Z*^{DWn?WeH!U$UH!f&&ZEOIoR&8(7FcALCeuWhz z1VW9MbT5#ml?DTe4;x6Rf^U`UxQUr#N4DE8|DLn+(zPA1u6(NR&VHWrle=@%!j@uk zg3^MK41Q56xV%G}iE&_TEtn}7vjHkJTZ{u*M3*zFN=`#+On_`%v2kG8y~W-xS;h$* zq?$1qAnXAxw}QD9n%;i@I=#LAbd8b(eN)<^8>Nj^dd?(xqT!|6kGlgYZj(yqDyuyL zN;AGfJcCq}v`ktT34ExJxI;`g*^h%cp?A4fOPLLLNphB;r53OK(%_{uF2b@|b7&R~ z9}F&=J43!*_f!DF+Dg7$ zW`Zp&G|{3DF^@n@P1tg literal 0 HcmV?d00001 diff --git a/tests/responses/unsounded-17-92.html.gz b/tests/responses/unsounded-17-92.html.gz new file mode 100644 index 0000000000000000000000000000000000000000..3b28c44be4b808843d937af7256e87ceb15c5912 GIT binary patch literal 669 zcmV;O0%H9iiwFq4V1r}=19fh5Z*^{DWn?WeH!V3bE@*UZYyhQIO^?$s5IxghVHpX5 zP-EHxt7g+mg$0SjE=Z_?Tje-zV(P?^?QWZYkH3;`+Ai9qr+CKBd-KLKI4ZA?bPsNin)w5m7P{5v3r|O-*M|)4fK{E@{TF`H@IU zD+rJS)J$_~S73Sf^ZWAd=F1I;WAH;r4Q_>0TF8`E#xot;JJa{X7Z)K_A2MG>>3ax7uBedJ?B7)Tx^LxfQW>sWVqGhGT zw#sORa4NPY3xs2(9NdZ&SegXbSZC002XL^kRv<$7e<@v>-!ik__ld2)WA;|d{MlL; zGw+1tS&uV#0_ToxSV#HGZe2`SGDWD_sG7(YWRKkLb=>nMx_nfsp)Ui3=A z;lSX1cAy<;!pa;dNuDlI7qp}bDUD^eRT{Ap=f-yNM0J&icIoHQD>$s24A+-4m}0WY zr9~e)tQc%1e;r_5U931=>s6XZf9m`Vbn>l-2oC37fJZ8^%vm+mqK-$?Nxi=g4tqHM z@ZLC}*Ln&*9b0e%7vjSYC&hH)Z*%^jab5BLQ$cN3Sel9AAjUrUK;TpJ%q&Qj7e{Gu zuE+A=WDiGIj|um9N6On8dwN)@&{~w8LL1gv2z@%e{6W(SC*#YfXw>f+OWoD!Cu5vc zRSR6QEaPF1bj1N{&$Obrk+nkO!^|$LhekXbba62Ex}iBpYQmdNFwUS z-B|nYf^#z{-2(z%y&Qdwi*MF#)? Dh%G^d literal 0 HcmV?d00001 diff --git a/tests/responses/unsounded-root.html.gz b/tests/responses/unsounded-root.html.gz new file mode 100644 index 0000000000000000000000000000000000000000..7a0731d2c6ed6415d3f6b1091676c0d170a13b2f GIT binary patch literal 8766 zcmdU#c~sJQzsIMs$7xe1tFbb3GP7*a%#d=O%4(?8%mE1#%3V=NaYJ0{wwRhZX=y^~ zXr?9L0xE83<(4T@Ibf)0X@ZJrieR{LsaXoip8MQ$&vWkc$B)l>zj%G$-|x@kIUE-4 zxJ8S0oW9`ptx^yQgAPKuxx4KMK%)Z#a&H!L}w&l+~0uH}IpD_H)ie8xt+V~1rgSL9GNJ8JJA5cIB}(>U#5 zVkPnCmqOVR3Vdd$MZ=07_7gl(V%#*4!y0iDnWk%bRMRDWm!isB@Agn4DPbDJZVhqX zU^0Rm-ODNR?OH*c_&vc4NyVHkoot|TYtW(F8iK5l5>DKlH0`xE7ybxIMorGKUG7PDS6qGB@T0;@>?Eb0iq5LVhkG<2GDfO#P|EqQYa zm~Lj3o(7OsMJ};3NcLn1rpNTV!m{~{ZJlMJ5$$%nD=cQU<;!q%m)xO|IX z2y71tvR~iV#7nAZ`)z4G*m*r5{a{nl(AmN#_z62PDl#o9oldY3484_9O4eC6bw&DG z9x|v}{XA)NTW_~T+P%@qx@daf%$c(VDUgwxA$UiPxZF>>W28iB=(i5l!Wxv><7?$_ zSDOu&Oud{znLV;K^=n7^wF^p`cLXta7}0*Wpw)JsnCMi;9OYt;&I`#6oneA9i>U#| zOghHGqfLzKJ2_X4Er_im&1)LzlVfm;*eMp|p-)%Ty}( zCT^&j;WCxfjshI&tN2P;d?&y23ManY6X+6rcbF4@i+djzywL+Vn8y7o_6Y7BW|kRY zG((tv&7JjJjHWTof1q+Ls1-bu zfwYc`Wj5t}lQXEK;j;O0?a*t&!OjTwy({*KPwY=D)CcM6>u283rQBF~=7_rYk=Q+( zV$3%k{4LQsGg0RRNXG%RJXiO6Zq^lWVk6k(QS1?j$@AD-eR`*i2A=A{jU-R?P8%(5 z(sLexnz&Q19TvBx~g)or|D| zq;OQGol(;wmlr|m)Jb1KsCBc$8zcR1Q+3r-S6)9>-r}}pA6{yKi+IN!yI=l3L-Odr z(`(N!$SyI(b$mC@f}D#jyv@dn16WPJ2qyU(Fl8ww@6a6p0~Gk2&iu<(hhVYWM+ z-oKiRD9uzt#;TI^O$)O7AL8Z1-QwKO87=nORH3&_9C(P&Rzl)cN#3R~B0;rs4x1x< z^yHXa_>X8fRKXaI+I`0`xeja3%N%(*c4a(A`}tIBLVsG0&rb!}3m)N>7b4ewPxAf{ zV*cR_{Bd*<>)b_%%FkZ&4C^bIYkz%~yI?W-(>0Y^9z`?VgQfkWL=xdYkU@ zmsvmizIf=?N%++%+~<%_lC^J=4h^JJ3cB8@yPTW#1eLf5eIg48ss-wg5p=)3y>hXw z`f}UYWt(HZ-+VAO$@*%NP6Mds{M3cBrQSPg%{MYZI?q7Y`*oKOWIYK?ToiO7%M?^= zsvmGCP;1CMw#fx?Ba0UKuKzjBB)jH2Kz7aB_vOMW(|XRSyF7QL3i-(MXY-7BlJ(8Z46y!M*rySszc%6cJufeC>}ANuaVYKH{0)D$ z$5+~aYLNA{Cufd+^LN8vMY^PaUi1VmS2PLH4~vq}s!S#Eb=|M_)pei7jRHgQbV47; z&fT6lMtQ1gH&$nEI6|6%8&D^xlF{wB;`+GCTne8m>B(v)AW}`x#7?-9D%)Q}(@3!VspK~J zC`ZU;p)8_kG=Q6-5t6_f( zZ1Ptc{%jvR^r@i=tklI>AC7T-7Iob9S$Eg!aWrC8Idxm8{SSQ3)Jb6vUk8^u13C4E z<8ALz-4_=(Lahc$Z$~~&VNGzFT8FUMR6|P5E@l@ab}PbS`xJ6ip9$}i7SoUS=Oaq@ zFF-E&2T6ZrLH6;7c=?b~oU6d(?2O{_;|~e)i{=Rbv|$pyysLBD<2hS@YVjg({nFV! z7YtUuQ#}0V!vCV7e+0fNMf(Tr%xiP$SySij)`brRz`AeQE0;nYwiiKGeHf>03eN%l zVsfF@GB55@=*6u+>?gWxD4J*J7~9wF_X2Of2>E;To5fz5(*7;%(z9I0MdXj;3iPOG zNd`Fod)QY-;oZSi`-$)kMIVOXV8&ZfX6U(c_)3_w;SzG5#wAEw+y{PrS$8@7cYkWH zlM}w{0u0|JPRjjykOXnn4mI4eoH3&?S{lB8L0p=_CrD zaUEcFGGa4XClZS@Zgr|Y%UH(VZ47N~K5b)s0U#faA;Ahv&OU|=GQI$mUo=PfH#Cg+ zcnn9Kd*D%!jemsqQAXyjAl zpy%`$@4FZ&AMUYpKe8c)uYSN)wCXPL`ju`&ermhKdDY=Oes~zA^yzGJ)YB#^i)LCm z(V;V&oS(llA3l4-Ge6dyRhfr7ag{Dwn&NWWCiXl)Uzwe?8k!^@iXitCn4Db%iMq3_ zt9CVXh3jdXtLFhm%IrCT3|3@vHkpOkd;z#unf*q#m8|93dBEO!#}Zud)`o}pbxO#9 ztL)qlV;QN@s6g*eCKs-<2Q`~RSGb+F`SmqE_`gCGg-dk+9YTT=&jUcp?C_h*zH%KKdOgVtFyGHTrAkH_L_>fn2Z)5G z3>KsB*&r{#D+sfM>t*DsO)g%d9m@|8(J`xG5l3sE4La#u{!6*eQo!S6bj-_)=mZDrouD@Wn)U{d zwS_k7Wj3*|^i=Bjp4LE+8&Ll-VYj+*e7|1Iklw-2B-kn1Sl>xCo=TrbW?E}L$C4uE3!DS;a82x&hXxYQEy(1kY@2`7wR-?7=WqDXUYPyW23fl^$uGDjPT>si zmxw;?UWZqBe(tt~B^(vQ`H%W>Bk#{|c8K8tWS2#g{HD}k9b#I(-;^EIv31ZjNb25$ z!21p7LV3h$^6<_nK{yQ6-6C!?>2_ai+7RF1Ig36Cz1&O*I$%QlvdXsD!i6Ee0_fPt_Dz6N6~apx-J<8q!poTk z!o9On=IHR^TrG8E@XsX9hC+|%TD(Gd!XDI8M}JkAr=e%q1L_}QOe(F2q4y5+X5gLN zW!r{Pmsf=VYho)1XX?0&g1?;iPWKo-UjzAyT3S)YsU0b;z)F>xS&VoSM&v}EqLj)p0)9*`QoJ=H4c%J0(WDY^*piroBGJ(*R z^O`Vmq-rMa`XRL=gu@-JyyvY$JEw}b*ixz)Opl;o*bR;_V6uTVQzo1;$64j4V8QNa z#|hi)xS@jW{PgTNA1y1yv8G8%GfnDz9(PyvYP!eOrspak#+a?!0!@++G{`0eCTG(i zn_EKXEc#3k0*;cw4~*GavKCWUo58C9#Xx2~Fz%X1Ku|9Nuoxt`>nZ?i%zndwCIMY- z6aoer)O8j3RpDQ2K=T-CvrdwU%WCaa(H*pr*R4Z*Z;25iuj;bkZNs&DQ(W2mKSUmT zd@Bwxa|=kAnxa6{{1B9Afd;Cv9=}Nwne0nCNh$P@4=6HBfqiBtH4wh>AIk}C%jH^T zrL0Gr^bc?7dd0>3vJ}YP-U+3ib!9w`2mBRikR1w4UJ$b%-wBEwRC&?M(BWECYgXbe z`i#`$>6%C}io~fX^!PIjvvU>L9HFbLLR-uzpHNmawNWcc&;HJi%C8F$(SGY|>b*9? znsI{vfW4`ej1>!~#{(n7%kEP{4vI6e#vRPo3!}Lsw?cXJla5&o|lx;jVKuYEPKKJa&epVgo!uR5cVFrBsc{>?tH*ukGQ+V*C4w?$(l zUG|X~Fh%G-eXmSv@7(Q;1zIs_>EmGnjp3|lz-9~5-hnd=kQ&l942pSCgHKw|zCB*R z%W8uKeU`o?!jU478hXw$d65jBxBpME7?M8zj`u%0NR|Ye#B;U5Cjh)H*`I|-WDq_9 zD9=6<=&3xbJ$1HfFuxaKmR@1|c7p`L0Hr1J!L>Nb@caLgvL<{_@!4oWK)Jb7Ut4P~ zd{f6;yBiEOp8H-z8W+Q|lzr1;@f}jUj>N$fcrYK~5j|lRM<_Lc@ z;4ng2%-m!d9;2R2P4AjQoSl6+InpGIBNfdn1fqgiB1fHPjF#c?XT+rn6^h5|IdTV1?b5ZONR938wZKPSOL) zi`qPm#q~DkM}{Dz$`JHx7g!3`f2Q0J%N@Pi4TqoF9hHpk5)d~}Pfl)7!@giR#c!sY zp6C>2p&4&NP4cDh1GW-#o>_L()ah7SdkE`5sfJVuMKXb9Sx$LlUhTP)%s33W?hLRS z@D&-AOgcqUcBQ9VNw%f5V;492^s6S_eVKGCr%%w2*tK~!HSk___GhQeFTUe%*} zCpYelBm$+6(9=a;mSODb-a(i0Q_|n3$-5bO7rvu}AthUA`2t55E zKcr}|P-~Y(Unrtktve=u(ba`Ae(Q}*ZnGK=-vc|Aq8&oHpz*ebExi4zHbPL=2QS)irZ~@#1;LGNmME~ zivA>uAsyki<@fQCHN?`H2g~jH$9mjgd{&Cx0}_5hOrW9jrAdYOddCA!3HNjJA=)=( zKaQ9Qjo2L(cH58;@>s2*E^|w|hcwO3*49oOwP#1NsMlxq@a*oRIE`xQv*z~|y*)jH z(OvNTbS((=pfE#RS^A`!nS4+P6Y_47XJ;P>XwxZ&PYSC&D-PE+vIZBEW{tz%^-CAb N