main.py 204 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
73778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838383938403841384238433844384538463847384838493850385138523853385438553856385738583859386038613862386338643865386638673868386938703871387238733874387538763877387838793880388138823883388438853886388738883889389038913892389338943895389638973898389939003901390239033904390539063907390839093910391139123913391439153916391739183919392039213922392339243925392639273928392939303931393239333934393539363937393839393940394139423943394439453946394739483949395039513952395339543955395639573958395939603961396239633964396539663967396839693970397139723973397439753976397739783979398039813982398339843985398639873988398939903991399239933994399539963997399839994000400140024003400440054006400740084009401040114012401340144015401640174018401940204021402240234024402540264027402840294030403140324033403440354036403740384039404040414042404340444045404640474048404940504051405240534054405540564057405840594060406140624063406440654066406740684069407040714072407340744075407640774078407940804081408240834084408540864087408840894090409140924093409440954096409740984099410041014102410341044105410641074108410941104111411241134114411541164117411841194120412141224123412441254126412741284129413041314132413341344135413641374138413941404141414241434144414541464147414841494150415141524153415441554156415741584159416041614162416341644165416641674168416941704171417241734174417541764177417841794180418141824183418441854186418741884189419041914192419341944195419641974198419942004201420242034204420542064207420842094210421142124213421442154216421742184219422042214222422342244225422642274228422942304231423242334234423542364237423842394240424142424243424442454246424742484249425042514252425342544255425642574258425942604261426242634264426542664267426842694270427142724273427442754276427
74278427942804281428242834284428542864287428842894290429142924293429442954296429742984299430043014302430343044305430643074308430943104311431243134314431543164317431843194320432143224323432443254326432743284329433043314332433343344335433643374338433943404341434243434344434543464347434843494350435143524353435443554356435743584359436043614362436343644365436643674368436943704371437243734374437543764377437843794380438143824383438443854386438743884389439043914392439343944395439643974398439944004401440244034404440544064407440844094410441144124413441444154416441744184419442044214422442344244425442644274428442944304431443244334434443544364437443844394440444144424443444444454446444744484449445044514452445344544455445644574458445944604461446244634464446544664467446844694470447144724473447444754476447744784479448044814482448344844485448644874488448944904491449244934494449544964497449844994500450145024503450445054506450745084509451045114512451345144515451645174518451945204521452245234524452545264527452845294530453145324533453445354536453745384539454045414542454345444545454645474548454945504551455245534554455545564557455845594560456145624563456445654566456745684569457045714572457345744575457645774578457945804581458245834584458545864587458845894590459145924593459445954596459745984599460046014602460346044605460646074608460946104611461246134614461546164617461846194620462146224623462446254626462746284629463046314632463346344635463646374638463946404641464246434644464546464647464846494650465146524653465446554656465746584659466046614662466346644665466646674668466946704671467246734674467546764677467846794680468146824683468446854686468746884689469046914692469346944695469646974698469947004701470247034704470547064707470847094710471147124713471447154716471747184719472047214722472347244725472647274728472947304731473247334734473547364737473847394740474147424743474447454746474747484749475047514752475347544755475647574758475947604761476247634764476547664767476847694770477147724773477447754776477
74778477947804781478247834784478547864787478847894790479147924793479447954796479747984799480048014802480348044805480648074808480948104811481248134814481548164817481848194820482148224823482448254826482748284829483048314832483348344835483648374838483948404841484248434844484548464847484848494850485148524853485448554856485748584859486048614862486348644865486648674868486948704871487248734874487548764877487848794880488148824883488448854886488748884889489048914892489348944895489648974898489949004901490249034904490549064907490849094910491149124913491449154916491749184919492049214922492349244925492649274928492949304931493249334934493549364937493849394940494149424943494449454946494749484949495049514952495349544955495649574958495949604961496249634964496549664967496849694970497149724973497449754976497749784979498049814982498349844985498649874988498949904991499249934994499549964997499849995000500150025003500450055006500750085009501050115012501350145015501650175018501950205021502250235024502550265027502850295030503150325033503450355036503750385039504050415042504350445045504650475048504950505051505250535054505550565057505850595060506150625063506450655066506750685069507050715072507350745075507650775078507950805081508250835084508550865087508850895090509150925093509450955096509750985099510051015102510351045105510651075108510951105111511251135114511551165117511851195120512151225123512451255126512751285129513051315132513351345135513651375138513951405141514251435144514551465147514851495150515151525153515451555156515751585159516051615162516351645165516651675168516951705171517251735174517551765177517851795180518151825183518451855186518751885189519051915192519351945195519651975198519952005201520252035204520552065207520852095210521152125213521452155216521752185219522052215222522352245225522652275228522952305231523252335234523552365237523852395240524152425243524452455246524752485249525052515252525352545255525652575258525952605261526252635264526552665267526852695270527152725273527452755276527
75278527952805281528252835284528552865287528852895290529152925293529452955296529752985299530053015302530353045305530653075308530953105311531253135314531553165317531853195320532153225323532453255326532753285329533053315332533353345335533653375338533953405341534253435344534553465347534853495350535153525353535453555356535753585359536053615362536353645365536653675368536953705371537253735374537553765377537853795380538153825383538453855386538753885389539053915392539353945395539653975398539954005401540254035404540554065407540854095410541154125413541454155416541754185419542054215422542354245425542654275428542954305431
  1. # coding=utf-8
  2. import json
  3. import os
  4. import random
  5. import re
  6. import time
  7. import webbrowser
  8. import smtplib
  9. from email.mime.text import MIMEText
  10. from email.mime.multipart import MIMEMultipart
  11. from email.header import Header
  12. from email.utils import formataddr, formatdate, make_msgid
  13. from datetime import datetime
  14. from pathlib import Path
  15. from typing import Dict, List, Tuple, Optional, Union
  16. import pytz
  17. import requests
  18. import yaml
  19. VERSION = "3.5.0"
  20. # === SMTP邮件配置 ===
  21. SMTP_CONFIGS = {
  22. # Gmail(使用 STARTTLS)
  23. "gmail.com": {"server": "smtp.gmail.com", "port": 587, "encryption": "TLS"},
  24. # QQ邮箱(使用 SSL,更稳定)
  25. "qq.com": {"server": "smtp.qq.com", "port": 465, "encryption": "SSL"},
  26. # Outlook(使用 STARTTLS)
  27. "outlook.com": {
  28. "server": "smtp-mail.outlook.com",
  29. "port": 587,
  30. "encryption": "TLS",
  31. },
  32. "hotmail.com": {
  33. "server": "smtp-mail.outlook.com",
  34. "port": 587,
  35. "encryption": "TLS",
  36. },
  37. "live.com": {"server": "smtp-mail.outlook.com", "port": 587, "encryption": "TLS"},
  38. # 网易邮箱(使用 SSL,更稳定)
  39. "163.com": {"server": "smtp.163.com", "port": 465, "encryption": "SSL"},
  40. "126.com": {"server": "smtp.126.com", "port": 465, "encryption": "SSL"},
  41. # 新浪邮箱(使用 SSL)
  42. "sina.com": {"server": "smtp.sina.com", "port": 465, "encryption": "SSL"},
  43. # 搜狐邮箱(使用 SSL)
  44. "sohu.com": {"server": "smtp.sohu.com", "port": 465, "encryption": "SSL"},
  45. # 天翼邮箱(使用 SSL)
  46. "189.cn": {"server": "smtp.189.cn", "port": 465, "encryption": "SSL"},
  47. # 阿里云邮箱(使用 TLS)
  48. "aliyun.com": {"server": "smtp.aliyun.com", "port": 465, "encryption": "TLS"},
  49. }
  50. # === 多账号推送工具函数 ===
  51. def parse_multi_account_config(config_value: str, separator: str = ";") -> List[str]:
  52. """
  53. 解析多账号配置,返回账号列表
  54. Args:
  55. config_value: 配置值字符串,多个账号用分隔符分隔
  56. separator: 分隔符,默认为 ;
  57. Returns:
  58. 账号列表,空字符串会被保留(用于占位)
  59. """
  60. if not config_value:
  61. return []
  62. # 保留空字符串用于占位(如 ";token2" 表示第一个账号无token)
  63. accounts = [acc.strip() for acc in config_value.split(separator)]
  64. # 过滤掉全部为空的情况
  65. if all(not acc for acc in accounts):
  66. return []
  67. return accounts
  68. def validate_paired_configs(
  69. configs: Dict[str, List[str]],
  70. channel_name: str,
  71. required_keys: Optional[List[str]] = None
  72. ) -> Tuple[bool, int]:
  73. """
  74. 验证配对配置的数量是否一致
  75. Args:
  76. configs: 配置字典,key 为配置名,value 为账号列表
  77. channel_name: 渠道名称,用于日志输出
  78. required_keys: 必须有值的配置项列表
  79. Returns:
  80. (是否验证通过, 账号数量)
  81. """
  82. # 过滤掉空列表
  83. non_empty_configs = {k: v for k, v in configs.items() if v}
  84. if not non_empty_configs:
  85. return True, 0
  86. # 检查必须项
  87. if required_keys:
  88. for key in required_keys:
  89. if key not in non_empty_configs or not non_empty_configs[key]:
  90. return True, 0 # 必须项为空,视为未配置
  91. # 获取所有非空配置的长度
  92. lengths = {k: len(v) for k, v in non_empty_configs.items()}
  93. unique_lengths = set(lengths.values())
  94. if len(unique_lengths) > 1:
  95. print(f"❌ {channel_name} 配置错误:配对配置数量不一致,将跳过该渠道推送")
  96. for key, length in lengths.items():
  97. print(f" - {key}: {length} 个")
  98. return False, 0
  99. return True, list(unique_lengths)[0] if unique_lengths else 0
  100. def limit_accounts(
  101. accounts: List[str],
  102. max_count: int,
  103. channel_name: str
  104. ) -> List[str]:
  105. """
  106. 限制账号数量
  107. Args:
  108. accounts: 账号列表
  109. max_count: 最大账号数量
  110. channel_name: 渠道名称,用于日志输出
  111. Returns:
  112. 限制后的账号列表
  113. """
  114. if len(accounts) > max_count:
  115. print(f"⚠️ {channel_name} 配置了 {len(accounts)} 个账号,超过最大限制 {max_count},只使用前 {max_count} 个")
  116. print(f" ⚠️ 警告:如果您是 fork 用户,过多账号可能导致 GitHub Actions 运行时间过长,存在账号风险")
  117. return accounts[:max_count]
  118. return accounts
  119. def get_account_at_index(accounts: List[str], index: int, default: str = "") -> str:
  120. """
  121. 安全获取指定索引的账号值
  122. Args:
  123. accounts: 账号列表
  124. index: 索引
  125. default: 默认值
  126. Returns:
  127. 账号值或默认值
  128. """
  129. if index < len(accounts):
  130. return accounts[index] if accounts[index] else default
  131. return default
  132. # === 配置管理 ===
  133. def load_config():
  134. """加载配置文件"""
  135. config_path = os.environ.get("CONFIG_PATH", "config/config.yaml")
  136. if not Path(config_path).exists():
  137. raise FileNotFoundError(f"配置文件 {config_path} 不存在")
  138. with open(config_path, "r", encoding="utf-8") as f:
  139. config_data = yaml.safe_load(f)
  140. print(f"配置文件加载成功: {config_path}")
  141. # 构建配置
  142. config = {
  143. "VERSION_CHECK_URL": config_data["app"]["version_check_url"],
  144. "SHOW_VERSION_UPDATE": config_data["app"]["show_version_update"],
  145. "REQUEST_INTERVAL": config_data["crawler"]["request_interval"],
  146. "REPORT_MODE": os.environ.get("REPORT_MODE", "").strip()
  147. or config_data["report"]["mode"],
  148. "RANK_THRESHOLD": config_data["report"]["rank_threshold"],
  149. "SORT_BY_POSITION_FIRST": os.environ.get("SORT_BY_POSITION_FIRST", "").strip().lower()
  150. in ("true", "1")
  151. if os.environ.get("SORT_BY_POSITION_FIRST", "").strip()
  152. else config_data["report"].get("sort_by_position_first", False),
  153. "MAX_NEWS_PER_KEYWORD": int(
  154. os.environ.get("MAX_NEWS_PER_KEYWORD", "").strip() or "0"
  155. )
  156. or config_data["report"].get("max_news_per_keyword", 0),
  157. "REVERSE_CONTENT_ORDER": os.environ.get("REVERSE_CONTENT_ORDER", "").strip().lower()
  158. in ("true", "1")
  159. if os.environ.get("REVERSE_CONTENT_ORDER", "").strip()
  160. else config_data["report"].get("reverse_content_order", False),
  161. "USE_PROXY": config_data["crawler"]["use_proxy"],
  162. "DEFAULT_PROXY": config_data["crawler"]["default_proxy"],
  163. "ENABLE_CRAWLER": os.environ.get("ENABLE_CRAWLER", "").strip().lower()
  164. in ("true", "1")
  165. if os.environ.get("ENABLE_CRAWLER", "").strip()
  166. else config_data["crawler"]["enable_crawler"],
  167. "ENABLE_NOTIFICATION": os.environ.get("ENABLE_NOTIFICATION", "").strip().lower()
  168. in ("true", "1")
  169. if os.environ.get("ENABLE_NOTIFICATION", "").strip()
  170. else config_data["notification"]["enable_notification"],
  171. "MESSAGE_BATCH_SIZE": config_data["notification"]["message_batch_size"],
  172. "DINGTALK_BATCH_SIZE": config_data["notification"].get(
  173. "dingtalk_batch_size", 20000
  174. ),
  175. "FEISHU_BATCH_SIZE": config_data["notification"].get("feishu_batch_size", 29000),
  176. "BARK_BATCH_SIZE": config_data["notification"].get("bark_batch_size", 3600),
  177. "SLACK_BATCH_SIZE": config_data["notification"].get("slack_batch_size", 4000),
  178. "BATCH_SEND_INTERVAL": config_data["notification"]["batch_send_interval"],
  179. "FEISHU_MESSAGE_SEPARATOR": config_data["notification"][
  180. "feishu_message_separator"
  181. ],
  182. # 多账号配置
  183. "MAX_ACCOUNTS_PER_CHANNEL": int(
  184. os.environ.get("MAX_ACCOUNTS_PER_CHANNEL", "").strip() or "0"
  185. )
  186. or config_data["notification"].get("max_accounts_per_channel", 3),
  187. "PUSH_WINDOW": {
  188. "ENABLED": os.environ.get("PUSH_WINDOW_ENABLED", "").strip().lower()
  189. in ("true", "1")
  190. if os.environ.get("PUSH_WINDOW_ENABLED", "").strip()
  191. else config_data["notification"]
  192. .get("push_window", {})
  193. .get("enabled", False),
  194. "TIME_RANGE": {
  195. "START": os.environ.get("PUSH_WINDOW_START", "").strip()
  196. or config_data["notification"]
  197. .get("push_window", {})
  198. .get("time_range", {})
  199. .get("start", "08:00"),
  200. "END": os.environ.get("PUSH_WINDOW_END", "").strip()
  201. or config_data["notification"]
  202. .get("push_window", {})
  203. .get("time_range", {})
  204. .get("end", "22:00"),
  205. },
  206. "ONCE_PER_DAY": os.environ.get("PUSH_WINDOW_ONCE_PER_DAY", "").strip().lower()
  207. in ("true", "1")
  208. if os.environ.get("PUSH_WINDOW_ONCE_PER_DAY", "").strip()
  209. else config_data["notification"]
  210. .get("push_window", {})
  211. .get("once_per_day", True),
  212. "RECORD_RETENTION_DAYS": int(
  213. os.environ.get("PUSH_WINDOW_RETENTION_DAYS", "").strip() or "0"
  214. )
  215. or config_data["notification"]
  216. .get("push_window", {})
  217. .get("push_record_retention_days", 7),
  218. },
  219. "WEIGHT_CONFIG": {
  220. "RANK_WEIGHT": config_data["weight"]["rank_weight"],
  221. "FREQUENCY_WEIGHT": config_data["weight"]["frequency_weight"],
  222. "HOTNESS_WEIGHT": config_data["weight"]["hotness_weight"],
  223. },
  224. "PLATFORMS": config_data["platforms"],
  225. }
  226. # 通知渠道配置(环境变量优先)
  227. notification = config_data.get("notification", {})
  228. webhooks = notification.get("webhooks", {})
  229. config["FEISHU_WEBHOOK_URL"] = os.environ.get(
  230. "FEISHU_WEBHOOK_URL", ""
  231. ).strip() or webhooks.get("feishu_url", "")
  232. config["DINGTALK_WEBHOOK_URL"] = os.environ.get(
  233. "DINGTALK_WEBHOOK_URL", ""
  234. ).strip() or webhooks.get("dingtalk_url", "")
  235. config["WEWORK_WEBHOOK_URL"] = os.environ.get(
  236. "WEWORK_WEBHOOK_URL", ""
  237. ).strip() or webhooks.get("wework_url", "")
  238. config["WEWORK_MSG_TYPE"] = os.environ.get(
  239. "WEWORK_MSG_TYPE", ""
  240. ).strip() or webhooks.get("wework_msg_type", "markdown")
  241. config["TELEGRAM_BOT_TOKEN"] = os.environ.get(
  242. "TELEGRAM_BOT_TOKEN", ""
  243. ).strip() or webhooks.get("telegram_bot_token", "")
  244. config["TELEGRAM_CHAT_ID"] = os.environ.get(
  245. "TELEGRAM_CHAT_ID", ""
  246. ).strip() or webhooks.get("telegram_chat_id", "")
  247. # 邮件配置
  248. config["EMAIL_FROM"] = os.environ.get("EMAIL_FROM", "").strip() or webhooks.get(
  249. "email_from", ""
  250. )
  251. config["EMAIL_PASSWORD"] = os.environ.get(
  252. "EMAIL_PASSWORD", ""
  253. ).strip() or webhooks.get("email_password", "")
  254. config["EMAIL_TO"] = os.environ.get("EMAIL_TO", "").strip() or webhooks.get(
  255. "email_to", ""
  256. )
  257. config["EMAIL_SMTP_SERVER"] = os.environ.get(
  258. "EMAIL_SMTP_SERVER", ""
  259. ).strip() or webhooks.get("email_smtp_server", "")
  260. config["EMAIL_SMTP_PORT"] = os.environ.get(
  261. "EMAIL_SMTP_PORT", ""
  262. ).strip() or webhooks.get("email_smtp_port", "")
  263. # ntfy配置
  264. config["NTFY_SERVER_URL"] = (
  265. os.environ.get("NTFY_SERVER_URL", "").strip()
  266. or webhooks.get("ntfy_server_url")
  267. or "https://ntfy.sh"
  268. )
  269. config["NTFY_TOPIC"] = os.environ.get("NTFY_TOPIC", "").strip() or webhooks.get(
  270. "ntfy_topic", ""
  271. )
  272. config["NTFY_TOKEN"] = os.environ.get("NTFY_TOKEN", "").strip() or webhooks.get(
  273. "ntfy_token", ""
  274. )
  275. # Bark配置
  276. config["BARK_URL"] = os.environ.get("BARK_URL", "").strip() or webhooks.get(
  277. "bark_url", ""
  278. )
  279. # Slack配置
  280. config["SLACK_WEBHOOK_URL"] = os.environ.get("SLACK_WEBHOOK_URL", "").strip() or webhooks.get(
  281. "slack_webhook_url", ""
  282. )
  283. # 输出配置来源信息
  284. notification_sources = []
  285. max_accounts = config["MAX_ACCOUNTS_PER_CHANNEL"]
  286. if config["FEISHU_WEBHOOK_URL"]:
  287. accounts = parse_multi_account_config(config["FEISHU_WEBHOOK_URL"])
  288. count = min(len(accounts), max_accounts)
  289. source = "环境变量" if os.environ.get("FEISHU_WEBHOOK_URL") else "配置文件"
  290. notification_sources.append(f"飞书({source}, {count}个账号)")
  291. if config["DINGTALK_WEBHOOK_URL"]:
  292. accounts = parse_multi_account_config(config["DINGTALK_WEBHOOK_URL"])
  293. count = min(len(accounts), max_accounts)
  294. source = "环境变量" if os.environ.get("DINGTALK_WEBHOOK_URL") else "配置文件"
  295. notification_sources.append(f"钉钉({source}, {count}个账号)")
  296. if config["WEWORK_WEBHOOK_URL"]:
  297. accounts = parse_multi_account_config(config["WEWORK_WEBHOOK_URL"])
  298. count = min(len(accounts), max_accounts)
  299. source = "环境变量" if os.environ.get("WEWORK_WEBHOOK_URL") else "配置文件"
  300. notification_sources.append(f"企业微信({source}, {count}个账号)")
  301. if config["TELEGRAM_BOT_TOKEN"] and config["TELEGRAM_CHAT_ID"]:
  302. tokens = parse_multi_account_config(config["TELEGRAM_BOT_TOKEN"])
  303. chat_ids = parse_multi_account_config(config["TELEGRAM_CHAT_ID"])
  304. # 验证数量一致性
  305. valid, count = validate_paired_configs(
  306. {"bot_token": tokens, "chat_id": chat_ids},
  307. "Telegram",
  308. required_keys=["bot_token", "chat_id"]
  309. )
  310. if valid and count > 0:
  311. count = min(count, max_accounts)
  312. token_source = "环境变量" if os.environ.get("TELEGRAM_BOT_TOKEN") else "配置文件"
  313. notification_sources.append(f"Telegram({token_source}, {count}个账号)")
  314. if config["EMAIL_FROM"] and config["EMAIL_PASSWORD"] and config["EMAIL_TO"]:
  315. from_source = "环境变量" if os.environ.get("EMAIL_FROM") else "配置文件"
  316. notification_sources.append(f"邮件({from_source})")
  317. if config["NTFY_SERVER_URL"] and config["NTFY_TOPIC"]:
  318. topics = parse_multi_account_config(config["NTFY_TOPIC"])
  319. tokens = parse_multi_account_config(config["NTFY_TOKEN"])
  320. # ntfy 的 token 是可选的,但如果配置了,数量必须与 topic 一致
  321. if tokens:
  322. valid, count = validate_paired_configs(
  323. {"topic": topics, "token": tokens},
  324. "ntfy"
  325. )
  326. if valid and count > 0:
  327. count = min(count, max_accounts)
  328. server_source = "环境变量" if os.environ.get("NTFY_SERVER_URL") else "配置文件"
  329. notification_sources.append(f"ntfy({server_source}, {count}个账号)")
  330. else:
  331. count = min(len(topics), max_accounts)
  332. server_source = "环境变量" if os.environ.get("NTFY_SERVER_URL") else "配置文件"
  333. notification_sources.append(f"ntfy({server_source}, {count}个账号)")
  334. if config["BARK_URL"]:
  335. accounts = parse_multi_account_config(config["BARK_URL"])
  336. count = min(len(accounts), max_accounts)
  337. bark_source = "环境变量" if os.environ.get("BARK_URL") else "配置文件"
  338. notification_sources.append(f"Bark({bark_source}, {count}个账号)")
  339. if config["SLACK_WEBHOOK_URL"]:
  340. accounts = parse_multi_account_config(config["SLACK_WEBHOOK_URL"])
  341. count = min(len(accounts), max_accounts)
  342. slack_source = "环境变量" if os.environ.get("SLACK_WEBHOOK_URL") else "配置文件"
  343. notification_sources.append(f"Slack({slack_source}, {count}个账号)")
  344. if notification_sources:
  345. print(f"通知渠道配置来源: {', '.join(notification_sources)}")
  346. print(f"每个渠道最大账号数: {max_accounts}")
  347. else:
  348. print("未配置任何通知渠道")
  349. return config
# Load the configuration once at import time. Code further down reads the
# module-level CONFIG directly, and some signatures use it for default values,
# so this must run before those definitions are evaluated.
print("正在加载配置...")
CONFIG = load_config()
print(f"TrendRadar v{VERSION} 配置加载完成")
print(f"监控平台数量: {len(CONFIG['PLATFORMS'])}")
  354. # === 工具函数 ===
  355. def get_beijing_time():
  356. """获取北京时间"""
  357. return datetime.now(pytz.timezone("Asia/Shanghai"))
  358. def format_date_folder():
  359. """格式化日期文件夹"""
  360. return get_beijing_time().strftime("%Y年%m月%d日")
  361. def format_time_filename():
  362. """格式化时间文件名"""
  363. return get_beijing_time().strftime("%H时%M分")
  364. def clean_title(title: str) -> str:
  365. """清理标题中的特殊字符"""
  366. if not isinstance(title, str):
  367. title = str(title)
  368. cleaned_title = title.replace("\n", " ").replace("\r", " ")
  369. cleaned_title = re.sub(r"\s+", " ", cleaned_title)
  370. cleaned_title = cleaned_title.strip()
  371. return cleaned_title
  372. def ensure_directory_exists(directory: str):
  373. """确保目录存在"""
  374. Path(directory).mkdir(parents=True, exist_ok=True)
  375. def get_output_path(subfolder: str, filename: str) -> str:
  376. """获取输出路径"""
  377. date_folder = format_date_folder()
  378. output_dir = Path("output") / date_folder / subfolder
  379. ensure_directory_exists(str(output_dir))
  380. return str(output_dir / filename)
  381. def check_version_update(
  382. current_version: str, version_url: str, proxy_url: Optional[str] = None
  383. ) -> Tuple[bool, Optional[str]]:
  384. """检查版本更新"""
  385. try:
  386. proxies = None
  387. if proxy_url:
  388. proxies = {"http": proxy_url, "https": proxy_url}
  389. headers = {
  390. "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
  391. "Accept": "text/plain, */*",
  392. "Cache-Control": "no-cache",
  393. }
  394. response = requests.get(
  395. version_url, proxies=proxies, headers=headers, timeout=10
  396. )
  397. response.raise_for_status()
  398. remote_version = response.text.strip()
  399. print(f"当前版本: {current_version}, 远程版本: {remote_version}")
  400. # 比较版本
  401. def parse_version(version_str):
  402. try:
  403. parts = version_str.strip().split(".")
  404. if len(parts) != 3:
  405. raise ValueError("版本号格式不正确")
  406. return int(parts[0]), int(parts[1]), int(parts[2])
  407. except:
  408. return 0, 0, 0
  409. current_tuple = parse_version(current_version)
  410. remote_tuple = parse_version(remote_version)
  411. need_update = current_tuple < remote_tuple
  412. return need_update, remote_version if need_update else None
  413. except Exception as e:
  414. print(f"版本检查失败: {e}")
  415. return False, None
  416. def is_first_crawl_today() -> bool:
  417. """检测是否是当天第一次爬取"""
  418. date_folder = format_date_folder()
  419. txt_dir = Path("output") / date_folder / "txt"
  420. if not txt_dir.exists():
  421. return True
  422. files = sorted([f for f in txt_dir.iterdir() if f.suffix == ".txt"])
  423. return len(files) <= 1
  424. def html_escape(text: str) -> str:
  425. """HTML转义"""
  426. if not isinstance(text, str):
  427. text = str(text)
  428. return (
  429. text.replace("&", "&amp;")
  430. .replace("<", "&lt;")
  431. .replace(">", "&gt;")
  432. .replace('"', "&quot;")
  433. .replace("'", "&#x27;")
  434. )
  435. # === 推送记录管理 ===
  436. class PushRecordManager:
  437. """推送记录管理器"""
  438. def __init__(self):
  439. self.record_dir = Path("output") / ".push_records"
  440. self.ensure_record_dir()
  441. self.cleanup_old_records()
  442. def ensure_record_dir(self):
  443. """确保记录目录存在"""
  444. self.record_dir.mkdir(parents=True, exist_ok=True)
  445. def get_today_record_file(self) -> Path:
  446. """获取今天的记录文件路径"""
  447. today = get_beijing_time().strftime("%Y%m%d")
  448. return self.record_dir / f"push_record_{today}.json"
  449. def cleanup_old_records(self):
  450. """清理过期的推送记录"""
  451. retention_days = CONFIG["PUSH_WINDOW"]["RECORD_RETENTION_DAYS"]
  452. current_time = get_beijing_time()
  453. for record_file in self.record_dir.glob("push_record_*.json"):
  454. try:
  455. date_str = record_file.stem.replace("push_record_", "")
  456. file_date = datetime.strptime(date_str, "%Y%m%d")
  457. file_date = pytz.timezone("Asia/Shanghai").localize(file_date)
  458. if (current_time - file_date).days > retention_days:
  459. record_file.unlink()
  460. print(f"清理过期推送记录: {record_file.name}")
  461. except Exception as e:
  462. print(f"清理记录文件失败 {record_file}: {e}")
  463. def has_pushed_today(self) -> bool:
  464. """检查今天是否已经推送过"""
  465. record_file = self.get_today_record_file()
  466. if not record_file.exists():
  467. return False
  468. try:
  469. with open(record_file, "r", encoding="utf-8") as f:
  470. record = json.load(f)
  471. return record.get("pushed", False)
  472. except Exception as e:
  473. print(f"读取推送记录失败: {e}")
  474. return False
  475. def record_push(self, report_type: str):
  476. """记录推送"""
  477. record_file = self.get_today_record_file()
  478. now = get_beijing_time()
  479. record = {
  480. "pushed": True,
  481. "push_time": now.strftime("%Y-%m-%d %H:%M:%S"),
  482. "report_type": report_type,
  483. }
  484. try:
  485. with open(record_file, "w", encoding="utf-8") as f:
  486. json.dump(record, f, ensure_ascii=False, indent=2)
  487. print(f"推送记录已保存: {report_type} at {now.strftime('%H:%M:%S')}")
  488. except Exception as e:
  489. print(f"保存推送记录失败: {e}")
  490. def is_in_time_range(self, start_time: str, end_time: str) -> bool:
  491. """检查当前时间是否在指定时间范围内"""
  492. now = get_beijing_time()
  493. current_time = now.strftime("%H:%M")
  494. def normalize_time(time_str: str) -> str:
  495. """将时间字符串标准化为 HH:MM 格式"""
  496. try:
  497. parts = time_str.strip().split(":")
  498. if len(parts) != 2:
  499. raise ValueError(f"时间格式错误: {time_str}")
  500. hour = int(parts[0])
  501. minute = int(parts[1])
  502. if not (0 <= hour <= 23 and 0 <= minute <= 59):
  503. raise ValueError(f"时间范围错误: {time_str}")
  504. return f"{hour:02d}:{minute:02d}"
  505. except Exception as e:
  506. print(f"时间格式化错误 '{time_str}': {e}")
  507. return time_str
  508. normalized_start = normalize_time(start_time)
  509. normalized_end = normalize_time(end_time)
  510. normalized_current = normalize_time(current_time)
  511. result = normalized_start <= normalized_current <= normalized_end
  512. if not result:
  513. print(f"时间窗口判断:当前 {normalized_current},窗口 {normalized_start}-{normalized_end}")
  514. return result
  515. # === 数据获取 ===
  516. class DataFetcher:
  517. """数据获取器"""
  518. def __init__(self, proxy_url: Optional[str] = None):
  519. self.proxy_url = proxy_url
  520. def fetch_data(
  521. self,
  522. id_info: Union[str, Tuple[str, str]],
  523. max_retries: int = 2,
  524. min_retry_wait: int = 3,
  525. max_retry_wait: int = 5,
  526. ) -> Tuple[Optional[str], str, str]:
  527. """获取指定ID数据,支持重试"""
  528. if isinstance(id_info, tuple):
  529. id_value, alias = id_info
  530. else:
  531. id_value = id_info
  532. alias = id_value
  533. url = f"https://newsnow.busiyi.world/api/s?id={id_value}&latest"
  534. proxies = None
  535. if self.proxy_url:
  536. proxies = {"http": self.proxy_url, "https": self.proxy_url}
  537. headers = {
  538. "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
  539. "Accept": "application/json, text/plain, */*",
  540. "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
  541. "Connection": "keep-alive",
  542. "Cache-Control": "no-cache",
  543. }
  544. retries = 0
  545. while retries <= max_retries:
  546. try:
  547. response = requests.get(
  548. url, proxies=proxies, headers=headers, timeout=10
  549. )
  550. response.raise_for_status()
  551. data_text = response.text
  552. data_json = json.loads(data_text)
  553. status = data_json.get("status", "未知")
  554. if status not in ["success", "cache"]:
  555. raise ValueError(f"响应状态异常: {status}")
  556. status_info = "最新数据" if status == "success" else "缓存数据"
  557. print(f"获取 {id_value} 成功({status_info})")
  558. return data_text, id_value, alias
  559. except Exception as e:
  560. retries += 1
  561. if retries <= max_retries:
  562. base_wait = random.uniform(min_retry_wait, max_retry_wait)
  563. additional_wait = (retries - 1) * random.uniform(1, 2)
  564. wait_time = base_wait + additional_wait
  565. print(f"请求 {id_value} 失败: {e}. {wait_time:.2f}秒后重试...")
  566. time.sleep(wait_time)
  567. else:
  568. print(f"请求 {id_value} 失败: {e}")
  569. return None, id_value, alias
  570. return None, id_value, alias
  571. def crawl_websites(
  572. self,
  573. ids_list: List[Union[str, Tuple[str, str]]],
  574. request_interval: int = CONFIG["REQUEST_INTERVAL"],
  575. ) -> Tuple[Dict, Dict, List]:
  576. """爬取多个网站数据"""
  577. results = {}
  578. id_to_name = {}
  579. failed_ids = []
  580. for i, id_info in enumerate(ids_list):
  581. if isinstance(id_info, tuple):
  582. id_value, name = id_info
  583. else:
  584. id_value = id_info
  585. name = id_value
  586. id_to_name[id_value] = name
  587. response, _, _ = self.fetch_data(id_info)
  588. if response:
  589. try:
  590. data = json.loads(response)
  591. results[id_value] = {}
  592. for index, item in enumerate(data.get("items", []), 1):
  593. title = item.get("title")
  594. # 跳过无效标题(None、float、空字符串)
  595. if title is None or isinstance(title, float) or not str(title).strip():
  596. continue
  597. title = str(title).strip()
  598. url = item.get("url", "")
  599. mobile_url = item.get("mobileUrl", "")
  600. if title in results[id_value]:
  601. results[id_value][title]["ranks"].append(index)
  602. else:
  603. results[id_value][title] = {
  604. "ranks": [index],
  605. "url": url,
  606. "mobileUrl": mobile_url,
  607. }
  608. except json.JSONDecodeError:
  609. print(f"解析 {id_value} 响应失败")
  610. failed_ids.append(id_value)
  611. except Exception as e:
  612. print(f"处理 {id_value} 数据出错: {e}")
  613. failed_ids.append(id_value)
  614. else:
  615. failed_ids.append(id_value)
  616. if i < len(ids_list) - 1:
  617. actual_interval = request_interval + random.randint(-10, 20)
  618. actual_interval = max(50, actual_interval)
  619. time.sleep(actual_interval / 1000)
  620. print(f"成功: {list(results.keys())}, 失败: {failed_ids}")
  621. return results, id_to_name, failed_ids
  622. # === 数据处理 ===
  623. def save_titles_to_file(results: Dict, id_to_name: Dict, failed_ids: List) -> str:
  624. """保存标题到文件"""
  625. file_path = get_output_path("txt", f"{format_time_filename()}.txt")
  626. with open(file_path, "w", encoding="utf-8") as f:
  627. for id_value, title_data in results.items():
  628. # id | name 或 id
  629. name = id_to_name.get(id_value)
  630. if name and name != id_value:
  631. f.write(f"{id_value} | {name}\n")
  632. else:
  633. f.write(f"{id_value}\n")
  634. # 按排名排序标题
  635. sorted_titles = []
  636. for title, info in title_data.items():
  637. cleaned_title = clean_title(title)
  638. if isinstance(info, dict):
  639. ranks = info.get("ranks", [])
  640. url = info.get("url", "")
  641. mobile_url = info.get("mobileUrl", "")
  642. else:
  643. ranks = info if isinstance(info, list) else []
  644. url = ""
  645. mobile_url = ""
  646. rank = ranks[0] if ranks else 1
  647. sorted_titles.append((rank, cleaned_title, url, mobile_url))
  648. sorted_titles.sort(key=lambda x: x[0])
  649. for rank, cleaned_title, url, mobile_url in sorted_titles:
  650. line = f"{rank}. {cleaned_title}"
  651. if url:
  652. line += f" [URL:{url}]"
  653. if mobile_url:
  654. line += f" [MOBILE:{mobile_url}]"
  655. f.write(line + "\n")
  656. f.write("\n")
  657. if failed_ids:
  658. f.write("==== 以下ID请求失败 ====\n")
  659. for id_value in failed_ids:
  660. f.write(f"{id_value}\n")
  661. return file_path
  662. def load_frequency_words(
  663. frequency_file: Optional[str] = None,
  664. ) -> Tuple[List[Dict], List[str], List[str]]:
  665. """
  666. 加载频率词配置
  667. Returns:
  668. (词组列表, 词组内过滤词, 全局过滤词)
  669. """
  670. if frequency_file is None:
  671. frequency_file = os.environ.get(
  672. "FREQUENCY_WORDS_PATH", "config/frequency_words.txt"
  673. )
  674. frequency_path = Path(frequency_file)
  675. if not frequency_path.exists():
  676. raise FileNotFoundError(f"频率词文件 {frequency_file} 不存在")
  677. with open(frequency_path, "r", encoding="utf-8") as f:
  678. content = f.read()
  679. word_groups = [group.strip() for group in content.split("\n\n") if group.strip()]
  680. processed_groups = []
  681. filter_words = []
  682. global_filters = [] # 新增:全局过滤词列表
  683. # 默认区域(向后兼容)
  684. current_section = "WORD_GROUPS"
  685. for group in word_groups:
  686. lines = [line.strip() for line in group.split("\n") if line.strip()]
  687. if not lines:
  688. continue
  689. # 检查是否为区域标记
  690. if lines[0].startswith("[") and lines[0].endswith("]"):
  691. section_name = lines[0][1:-1].upper()
  692. if section_name in ("GLOBAL_FILTER", "WORD_GROUPS"):
  693. current_section = section_name
  694. lines = lines[1:] # 移除标记行
  695. # 处理全局过滤区域
  696. if current_section == "GLOBAL_FILTER":
  697. # 直接添加所有非空行到全局过滤列表
  698. for line in lines:
  699. # 忽略特殊语法前缀,只提取纯文本
  700. if line.startswith(("!", "+", "@")):
  701. continue # 全局过滤区不支持特殊语法
  702. if line:
  703. global_filters.append(line)
  704. continue
  705. # 处理词组区域(保持现有逻辑)
  706. words = lines
  707. group_required_words = []
  708. group_normal_words = []
  709. group_filter_words = []
  710. group_max_count = 0 # 默认不限制
  711. for word in words:
  712. if word.startswith("@"):
  713. # 解析最大显示数量(只接受正整数)
  714. try:
  715. count = int(word[1:])
  716. if count > 0:
  717. group_max_count = count
  718. except (ValueError, IndexError):
  719. pass # 忽略无效的@数字格式
  720. elif word.startswith("!"):
  721. filter_words.append(word[1:])
  722. group_filter_words.append(word[1:])
  723. elif word.startswith("+"):
  724. group_required_words.append(word[1:])
  725. else:
  726. group_normal_words.append(word)
  727. if group_required_words or group_normal_words:
  728. if group_normal_words:
  729. group_key = " ".join(group_normal_words)
  730. else:
  731. group_key = " ".join(group_required_words)
  732. processed_groups.append(
  733. {
  734. "required": group_required_words,
  735. "normal": group_normal_words,
  736. "group_key": group_key,
  737. "max_count": group_max_count, # 新增字段
  738. }
  739. )
  740. return processed_groups, filter_words, global_filters
  741. def parse_file_titles(file_path: Path) -> Tuple[Dict, Dict]:
  742. """解析单个txt文件的标题数据,返回(titles_by_id, id_to_name)"""
  743. titles_by_id = {}
  744. id_to_name = {}
  745. with open(file_path, "r", encoding="utf-8") as f:
  746. content = f.read()
  747. sections = content.split("\n\n")
  748. for section in sections:
  749. if not section.strip() or "==== 以下ID请求失败 ====" in section:
  750. continue
  751. lines = section.strip().split("\n")
  752. if len(lines) < 2:
  753. continue
  754. # id | name 或 id
  755. header_line = lines[0].strip()
  756. if " | " in header_line:
  757. parts = header_line.split(" | ", 1)
  758. source_id = parts[0].strip()
  759. name = parts[1].strip()
  760. id_to_name[source_id] = name
  761. else:
  762. source_id = header_line
  763. id_to_name[source_id] = source_id
  764. titles_by_id[source_id] = {}
  765. for line in lines[1:]:
  766. if line.strip():
  767. try:
  768. title_part = line.strip()
  769. rank = None
  770. # 提取排名
  771. if ". " in title_part and title_part.split(". ")[0].isdigit():
  772. rank_str, title_part = title_part.split(". ", 1)
  773. rank = int(rank_str)
  774. # 提取 MOBILE URL
  775. mobile_url = ""
  776. if " [MOBILE:" in title_part:
  777. title_part, mobile_part = title_part.rsplit(" [MOBILE:", 1)
  778. if mobile_part.endswith("]"):
  779. mobile_url = mobile_part[:-1]
  780. # 提取 URL
  781. url = ""
  782. if " [URL:" in title_part:
  783. title_part, url_part = title_part.rsplit(" [URL:", 1)
  784. if url_part.endswith("]"):
  785. url = url_part[:-1]
  786. title = clean_title(title_part.strip())
  787. ranks = [rank] if rank is not None else [1]
  788. titles_by_id[source_id][title] = {
  789. "ranks": ranks,
  790. "url": url,
  791. "mobileUrl": mobile_url,
  792. }
  793. except Exception as e:
  794. print(f"解析标题行出错: {line}, 错误: {e}")
  795. return titles_by_id, id_to_name
  796. def read_all_today_titles(
  797. current_platform_ids: Optional[List[str]] = None,
  798. ) -> Tuple[Dict, Dict, Dict]:
  799. """读取当天所有标题文件,支持按当前监控平台过滤"""
  800. date_folder = format_date_folder()
  801. txt_dir = Path("output") / date_folder / "txt"
  802. if not txt_dir.exists():
  803. return {}, {}, {}
  804. all_results = {}
  805. final_id_to_name = {}
  806. title_info = {}
  807. files = sorted([f for f in txt_dir.iterdir() if f.suffix == ".txt"])
  808. for file_path in files:
  809. time_info = file_path.stem
  810. titles_by_id, file_id_to_name = parse_file_titles(file_path)
  811. if current_platform_ids is not None:
  812. filtered_titles_by_id = {}
  813. filtered_id_to_name = {}
  814. for source_id, title_data in titles_by_id.items():
  815. if source_id in current_platform_ids:
  816. filtered_titles_by_id[source_id] = title_data
  817. if source_id in file_id_to_name:
  818. filtered_id_to_name[source_id] = file_id_to_name[source_id]
  819. titles_by_id = filtered_titles_by_id
  820. file_id_to_name = filtered_id_to_name
  821. final_id_to_name.update(file_id_to_name)
  822. for source_id, title_data in titles_by_id.items():
  823. process_source_data(
  824. source_id, title_data, time_info, all_results, title_info
  825. )
  826. return all_results, final_id_to_name, title_info
  827. def process_source_data(
  828. source_id: str,
  829. title_data: Dict,
  830. time_info: str,
  831. all_results: Dict,
  832. title_info: Dict,
  833. ) -> None:
  834. """处理来源数据,合并重复标题"""
  835. if source_id not in all_results:
  836. all_results[source_id] = title_data
  837. if source_id not in title_info:
  838. title_info[source_id] = {}
  839. for title, data in title_data.items():
  840. ranks = data.get("ranks", [])
  841. url = data.get("url", "")
  842. mobile_url = data.get("mobileUrl", "")
  843. title_info[source_id][title] = {
  844. "first_time": time_info,
  845. "last_time": time_info,
  846. "count": 1,
  847. "ranks": ranks,
  848. "url": url,
  849. "mobileUrl": mobile_url,
  850. }
  851. else:
  852. for title, data in title_data.items():
  853. ranks = data.get("ranks", [])
  854. url = data.get("url", "")
  855. mobile_url = data.get("mobileUrl", "")
  856. if title not in all_results[source_id]:
  857. all_results[source_id][title] = {
  858. "ranks": ranks,
  859. "url": url,
  860. "mobileUrl": mobile_url,
  861. }
  862. title_info[source_id][title] = {
  863. "first_time": time_info,
  864. "last_time": time_info,
  865. "count": 1,
  866. "ranks": ranks,
  867. "url": url,
  868. "mobileUrl": mobile_url,
  869. }
  870. else:
  871. existing_data = all_results[source_id][title]
  872. existing_ranks = existing_data.get("ranks", [])
  873. existing_url = existing_data.get("url", "")
  874. existing_mobile_url = existing_data.get("mobileUrl", "")
  875. merged_ranks = existing_ranks.copy()
  876. for rank in ranks:
  877. if rank not in merged_ranks:
  878. merged_ranks.append(rank)
  879. all_results[source_id][title] = {
  880. "ranks": merged_ranks,
  881. "url": existing_url or url,
  882. "mobileUrl": existing_mobile_url or mobile_url,
  883. }
  884. title_info[source_id][title]["last_time"] = time_info
  885. title_info[source_id][title]["ranks"] = merged_ranks
  886. title_info[source_id][title]["count"] += 1
  887. if not title_info[source_id][title].get("url"):
  888. title_info[source_id][title]["url"] = url
  889. if not title_info[source_id][title].get("mobileUrl"):
  890. title_info[source_id][title]["mobileUrl"] = mobile_url
  891. def detect_latest_new_titles(current_platform_ids: Optional[List[str]] = None) -> Dict:
  892. """检测当日最新批次的新增标题,支持按当前监控平台过滤"""
  893. date_folder = format_date_folder()
  894. txt_dir = Path("output") / date_folder / "txt"
  895. if not txt_dir.exists():
  896. return {}
  897. files = sorted([f for f in txt_dir.iterdir() if f.suffix == ".txt"])
  898. if len(files) < 2:
  899. return {}
  900. # 解析最新文件
  901. latest_file = files[-1]
  902. latest_titles, _ = parse_file_titles(latest_file)
  903. # 如果指定了当前平台列表,过滤最新文件数据
  904. if current_platform_ids is not None:
  905. filtered_latest_titles = {}
  906. for source_id, title_data in latest_titles.items():
  907. if source_id in current_platform_ids:
  908. filtered_latest_titles[source_id] = title_data
  909. latest_titles = filtered_latest_titles
  910. # 汇总历史标题(按平台过滤)
  911. historical_titles = {}
  912. for file_path in files[:-1]:
  913. historical_data, _ = parse_file_titles(file_path)
  914. # 过滤历史数据
  915. if current_platform_ids is not None:
  916. filtered_historical_data = {}
  917. for source_id, title_data in historical_data.items():
  918. if source_id in current_platform_ids:
  919. filtered_historical_data[source_id] = title_data
  920. historical_data = filtered_historical_data
  921. for source_id, titles_data in historical_data.items():
  922. if source_id not in historical_titles:
  923. historical_titles[source_id] = set()
  924. for title in titles_data.keys():
  925. historical_titles[source_id].add(title)
  926. # 找出新增标题
  927. new_titles = {}
  928. for source_id, latest_source_titles in latest_titles.items():
  929. historical_set = historical_titles.get(source_id, set())
  930. source_new_titles = {}
  931. for title, title_data in latest_source_titles.items():
  932. if title not in historical_set:
  933. source_new_titles[title] = title_data
  934. if source_new_titles:
  935. new_titles[source_id] = source_new_titles
  936. return new_titles
  937. # === 统计和分析 ===
  938. def calculate_news_weight(
  939. title_data: Dict, rank_threshold: int = CONFIG["RANK_THRESHOLD"]
  940. ) -> float:
  941. """计算新闻权重,用于排序"""
  942. ranks = title_data.get("ranks", [])
  943. if not ranks:
  944. return 0.0
  945. count = title_data.get("count", len(ranks))
  946. weight_config = CONFIG["WEIGHT_CONFIG"]
  947. # 排名权重:Σ(11 - min(rank, 10)) / 出现次数
  948. rank_scores = []
  949. for rank in ranks:
  950. score = 11 - min(rank, 10)
  951. rank_scores.append(score)
  952. rank_weight = sum(rank_scores) / len(ranks) if ranks else 0
  953. # 频次权重:min(出现次数, 10) × 10
  954. frequency_weight = min(count, 10) * 10
  955. # 热度加成:高排名次数 / 总出现次数 × 100
  956. high_rank_count = sum(1 for rank in ranks if rank <= rank_threshold)
  957. hotness_ratio = high_rank_count / len(ranks) if ranks else 0
  958. hotness_weight = hotness_ratio * 100
  959. total_weight = (
  960. rank_weight * weight_config["RANK_WEIGHT"]
  961. + frequency_weight * weight_config["FREQUENCY_WEIGHT"]
  962. + hotness_weight * weight_config["HOTNESS_WEIGHT"]
  963. )
  964. return total_weight
  965. def matches_word_groups(
  966. title: str, word_groups: List[Dict], filter_words: List[str], global_filters: Optional[List[str]] = None
  967. ) -> bool:
  968. """检查标题是否匹配词组规则"""
  969. # 防御性类型检查:确保 title 是有效字符串
  970. if not isinstance(title, str):
  971. title = str(title) if title is not None else ""
  972. if not title.strip():
  973. return False
  974. title_lower = title.lower()
  975. # 全局过滤检查(优先级最高)
  976. if global_filters:
  977. if any(global_word.lower() in title_lower for global_word in global_filters):
  978. return False
  979. # 如果没有配置词组,则匹配所有标题(支持显示全部新闻)
  980. if not word_groups:
  981. return True
  982. # 过滤词检查
  983. if any(filter_word.lower() in title_lower for filter_word in filter_words):
  984. return False
  985. # 词组匹配检查
  986. for group in word_groups:
  987. required_words = group["required"]
  988. normal_words = group["normal"]
  989. # 必须词检查
  990. if required_words:
  991. all_required_present = all(
  992. req_word.lower() in title_lower for req_word in required_words
  993. )
  994. if not all_required_present:
  995. continue
  996. # 普通词检查
  997. if normal_words:
  998. any_normal_present = any(
  999. normal_word.lower() in title_lower for normal_word in normal_words
  1000. )
  1001. if not any_normal_present:
  1002. continue
  1003. return True
  1004. return False
  1005. def format_time_display(first_time: str, last_time: str) -> str:
  1006. """格式化时间显示"""
  1007. if not first_time:
  1008. return ""
  1009. if first_time == last_time or not last_time:
  1010. return first_time
  1011. else:
  1012. return f"[{first_time} ~ {last_time}]"
  1013. def format_rank_display(ranks: List[int], rank_threshold: int, format_type: str) -> str:
  1014. """统一的排名格式化方法"""
  1015. if not ranks:
  1016. return ""
  1017. unique_ranks = sorted(set(ranks))
  1018. min_rank = unique_ranks[0]
  1019. max_rank = unique_ranks[-1]
  1020. if format_type == "html":
  1021. highlight_start = "<font color='red'><strong>"
  1022. highlight_end = "</strong></font>"
  1023. elif format_type == "feishu":
  1024. highlight_start = "<font color='red'>**"
  1025. highlight_end = "**</font>"
  1026. elif format_type == "dingtalk":
  1027. highlight_start = "**"
  1028. highlight_end = "**"
  1029. elif format_type == "wework":
  1030. highlight_start = "**"
  1031. highlight_end = "**"
  1032. elif format_type == "telegram":
  1033. highlight_start = "<b>"
  1034. highlight_end = "</b>"
  1035. elif format_type == "slack":
  1036. highlight_start = "*"
  1037. highlight_end = "*"
  1038. else:
  1039. highlight_start = "**"
  1040. highlight_end = "**"
  1041. if min_rank <= rank_threshold:
  1042. if min_rank == max_rank:
  1043. return f"{highlight_start}[{min_rank}]{highlight_end}"
  1044. else:
  1045. return f"{highlight_start}[{min_rank} - {max_rank}]{highlight_end}"
  1046. else:
  1047. if min_rank == max_rank:
  1048. return f"[{min_rank}]"
  1049. else:
  1050. return f"[{min_rank} - {max_rank}]"
  1051. def count_word_frequency(
  1052. results: Dict,
  1053. word_groups: List[Dict],
  1054. filter_words: List[str],
  1055. id_to_name: Dict,
  1056. title_info: Optional[Dict] = None,
  1057. rank_threshold: int = CONFIG["RANK_THRESHOLD"],
  1058. new_titles: Optional[Dict] = None,
  1059. mode: str = "daily",
  1060. global_filters: Optional[List[str]] = None,
  1061. ) -> Tuple[List[Dict], int]:
  1062. """统计词频,支持必须词、频率词、过滤词、全局过滤词,并标记新增标题"""
  1063. # 如果没有配置词组,创建一个包含所有新闻的虚拟词组
  1064. if not word_groups:
  1065. print("频率词配置为空,将显示所有新闻")
  1066. word_groups = [{"required": [], "normal": [], "group_key": "全部新闻"}]
  1067. filter_words = [] # 清空过滤词,显示所有新闻
  1068. is_first_today = is_first_crawl_today()
  1069. # 确定处理的数据源和新增标记逻辑
  1070. if mode == "incremental":
  1071. if is_first_today:
  1072. # 增量模式 + 当天第一次:处理所有新闻,都标记为新增
  1073. results_to_process = results
  1074. all_news_are_new = True
  1075. else:
  1076. # 增量模式 + 当天非第一次:只处理新增的新闻
  1077. results_to_process = new_titles if new_titles else {}
  1078. all_news_are_new = True
  1079. elif mode == "current":
  1080. # current 模式:只处理当前时间批次的新闻,但统计信息来自全部历史
  1081. if title_info:
  1082. latest_time = None
  1083. for source_titles in title_info.values():
  1084. for title_data in source_titles.values():
  1085. last_time = title_data.get("last_time", "")
  1086. if last_time:
  1087. if latest_time is None or last_time > latest_time:
  1088. latest_time = last_time
  1089. # 只处理 last_time 等于最新时间的新闻
  1090. if latest_time:
  1091. results_to_process = {}
  1092. for source_id, source_titles in results.items():
  1093. if source_id in title_info:
  1094. filtered_titles = {}
  1095. for title, title_data in source_titles.items():
  1096. if title in title_info[source_id]:
  1097. info = title_info[source_id][title]
  1098. if info.get("last_time") == latest_time:
  1099. filtered_titles[title] = title_data
  1100. if filtered_titles:
  1101. results_to_process[source_id] = filtered_titles
  1102. print(
  1103. f"当前榜单模式:最新时间 {latest_time},筛选出 {sum(len(titles) for titles in results_to_process.values())} 条当前榜单新闻"
  1104. )
  1105. else:
  1106. results_to_process = results
  1107. else:
  1108. results_to_process = results
  1109. all_news_are_new = False
  1110. else:
  1111. # 当日汇总模式:处理所有新闻
  1112. results_to_process = results
  1113. all_news_are_new = False
  1114. total_input_news = sum(len(titles) for titles in results.values())
  1115. filter_status = (
  1116. "全部显示"
  1117. if len(word_groups) == 1 and word_groups[0]["group_key"] == "全部新闻"
  1118. else "频率词过滤"
  1119. )
  1120. print(f"当日汇总模式:处理 {total_input_news} 条新闻,模式:{filter_status}")
  1121. word_stats = {}
  1122. total_titles = 0
  1123. processed_titles = {}
  1124. matched_new_count = 0
  1125. if title_info is None:
  1126. title_info = {}
  1127. if new_titles is None:
  1128. new_titles = {}
  1129. for group in word_groups:
  1130. group_key = group["group_key"]
  1131. word_stats[group_key] = {"count": 0, "titles": {}}
  1132. for source_id, titles_data in results_to_process.items():
  1133. total_titles += len(titles_data)
  1134. if source_id not in processed_titles:
  1135. processed_titles[source_id] = {}
  1136. for title, title_data in titles_data.items():
  1137. if title in processed_titles.get(source_id, {}):
  1138. continue
  1139. # 使用统一的匹配逻辑
  1140. matches_frequency_words = matches_word_groups(
  1141. title, word_groups, filter_words, global_filters
  1142. )
  1143. if not matches_frequency_words:
  1144. continue
  1145. # 如果是增量模式或 current 模式第一次,统计匹配的新增新闻数量
  1146. if (mode == "incremental" and all_news_are_new) or (
  1147. mode == "current" and is_first_today
  1148. ):
  1149. matched_new_count += 1
  1150. source_ranks = title_data.get("ranks", [])
  1151. source_url = title_data.get("url", "")
  1152. source_mobile_url = title_data.get("mobileUrl", "")
  1153. # 找到匹配的词组(防御性转换确保类型安全)
  1154. title_lower = str(title).lower() if not isinstance(title, str) else title.lower()
  1155. for group in word_groups:
  1156. required_words = group["required"]
  1157. normal_words = group["normal"]
  1158. # 如果是"全部新闻"模式,所有标题都匹配第一个(唯一的)词组
  1159. if len(word_groups) == 1 and word_groups[0]["group_key"] == "全部新闻":
  1160. group_key = group["group_key"]
  1161. word_stats[group_key]["count"] += 1
  1162. if source_id not in word_stats[group_key]["titles"]:
  1163. word_stats[group_key]["titles"][source_id] = []
  1164. else:
  1165. # 原有的匹配逻辑
  1166. if required_words:
  1167. all_required_present = all(
  1168. req_word.lower() in title_lower
  1169. for req_word in required_words
  1170. )
  1171. if not all_required_present:
  1172. continue
  1173. if normal_words:
  1174. any_normal_present = any(
  1175. normal_word.lower() in title_lower
  1176. for normal_word in normal_words
  1177. )
  1178. if not any_normal_present:
  1179. continue
  1180. group_key = group["group_key"]
  1181. word_stats[group_key]["count"] += 1
  1182. if source_id not in word_stats[group_key]["titles"]:
  1183. word_stats[group_key]["titles"][source_id] = []
  1184. first_time = ""
  1185. last_time = ""
  1186. count_info = 1
  1187. ranks = source_ranks if source_ranks else []
  1188. url = source_url
  1189. mobile_url = source_mobile_url
  1190. # 对于 current 模式,从历史统计信息中获取完整数据
  1191. if (
  1192. mode == "current"
  1193. and title_info
  1194. and source_id in title_info
  1195. and title in title_info[source_id]
  1196. ):
  1197. info = title_info[source_id][title]
  1198. first_time = info.get("first_time", "")
  1199. last_time = info.get("last_time", "")
  1200. count_info = info.get("count", 1)
  1201. if "ranks" in info and info["ranks"]:
  1202. ranks = info["ranks"]
  1203. url = info.get("url", source_url)
  1204. mobile_url = info.get("mobileUrl", source_mobile_url)
  1205. elif (
  1206. title_info
  1207. and source_id in title_info
  1208. and title in title_info[source_id]
  1209. ):
  1210. info = title_info[source_id][title]
  1211. first_time = info.get("first_time", "")
  1212. last_time = info.get("last_time", "")
  1213. count_info = info.get("count", 1)
  1214. if "ranks" in info and info["ranks"]:
  1215. ranks = info["ranks"]
  1216. url = info.get("url", source_url)
  1217. mobile_url = info.get("mobileUrl", source_mobile_url)
  1218. if not ranks:
  1219. ranks = [99]
  1220. time_display = format_time_display(first_time, last_time)
  1221. source_name = id_to_name.get(source_id, source_id)
  1222. # 判断是否为新增
  1223. is_new = False
  1224. if all_news_are_new:
  1225. # 增量模式下所有处理的新闻都是新增,或者当天第一次的所有新闻都是新增
  1226. is_new = True
  1227. elif new_titles and source_id in new_titles:
  1228. # 检查是否在新增列表中
  1229. new_titles_for_source = new_titles[source_id]
  1230. is_new = title in new_titles_for_source
  1231. word_stats[group_key]["titles"][source_id].append(
  1232. {
  1233. "title": title,
  1234. "source_name": source_name,
  1235. "first_time": first_time,
  1236. "last_time": last_time,
  1237. "time_display": time_display,
  1238. "count": count_info,
  1239. "ranks": ranks,
  1240. "rank_threshold": rank_threshold,
  1241. "url": url,
  1242. "mobileUrl": mobile_url,
  1243. "is_new": is_new,
  1244. }
  1245. )
  1246. if source_id not in processed_titles:
  1247. processed_titles[source_id] = {}
  1248. processed_titles[source_id][title] = True
  1249. break
  1250. # 最后统一打印汇总信息
  1251. if mode == "incremental":
  1252. if is_first_today:
  1253. total_input_news = sum(len(titles) for titles in results.values())
  1254. filter_status = (
  1255. "全部显示"
  1256. if len(word_groups) == 1 and word_groups[0]["group_key"] == "全部新闻"
  1257. else "频率词匹配"
  1258. )
  1259. print(
  1260. f"增量模式:当天第一次爬取,{total_input_news} 条新闻中有 {matched_new_count} 条{filter_status}"
  1261. )
  1262. else:
  1263. if new_titles:
  1264. total_new_count = sum(len(titles) for titles in new_titles.values())
  1265. filter_status = (
  1266. "全部显示"
  1267. if len(word_groups) == 1
  1268. and word_groups[0]["group_key"] == "全部新闻"
  1269. else "匹配频率词"
  1270. )
  1271. print(
  1272. f"增量模式:{total_new_count} 条新增新闻中,有 {matched_new_count} 条{filter_status}"
  1273. )
  1274. if matched_new_count == 0 and len(word_groups) > 1:
  1275. print("增量模式:没有新增新闻匹配频率词,将不会发送通知")
  1276. else:
  1277. print("增量模式:未检测到新增新闻")
  1278. elif mode == "current":
  1279. total_input_news = sum(len(titles) for titles in results_to_process.values())
  1280. if is_first_today:
  1281. filter_status = (
  1282. "全部显示"
  1283. if len(word_groups) == 1 and word_groups[0]["group_key"] == "全部新闻"
  1284. else "频率词匹配"
  1285. )
  1286. print(
  1287. f"当前榜单模式:当天第一次爬取,{total_input_news} 条当前榜单新闻中有 {matched_new_count} 条{filter_status}"
  1288. )
  1289. else:
  1290. matched_count = sum(stat["count"] for stat in word_stats.values())
  1291. filter_status = (
  1292. "全部显示"
  1293. if len(word_groups) == 1 and word_groups[0]["group_key"] == "全部新闻"
  1294. else "频率词匹配"
  1295. )
  1296. print(
  1297. f"当前榜单模式:{total_input_news} 条当前榜单新闻中有 {matched_count} 条{filter_status}"
  1298. )
  1299. stats = []
  1300. # 创建 group_key 到位置和最大数量的映射
  1301. group_key_to_position = {
  1302. group["group_key"]: idx for idx, group in enumerate(word_groups)
  1303. }
  1304. group_key_to_max_count = {
  1305. group["group_key"]: group.get("max_count", 0) for group in word_groups
  1306. }
  1307. for group_key, data in word_stats.items():
  1308. all_titles = []
  1309. for source_id, title_list in data["titles"].items():
  1310. all_titles.extend(title_list)
  1311. # 按权重排序
  1312. sorted_titles = sorted(
  1313. all_titles,
  1314. key=lambda x: (
  1315. -calculate_news_weight(x, rank_threshold),
  1316. min(x["ranks"]) if x["ranks"] else 999,
  1317. -x["count"],
  1318. ),
  1319. )
  1320. # 应用最大显示数量限制(优先级:单独配置 > 全局配置)
  1321. group_max_count = group_key_to_max_count.get(group_key, 0)
  1322. if group_max_count == 0:
  1323. # 使用全局配置
  1324. group_max_count = CONFIG.get("MAX_NEWS_PER_KEYWORD", 0)
  1325. if group_max_count > 0:
  1326. sorted_titles = sorted_titles[:group_max_count]
  1327. stats.append(
  1328. {
  1329. "word": group_key,
  1330. "count": data["count"],
  1331. "position": group_key_to_position.get(group_key, 999),
  1332. "titles": sorted_titles,
  1333. "percentage": (
  1334. round(data["count"] / total_titles * 100, 2)
  1335. if total_titles > 0
  1336. else 0
  1337. ),
  1338. }
  1339. )
  1340. # 根据配置选择排序优先级
  1341. if CONFIG.get("SORT_BY_POSITION_FIRST", False):
  1342. # 先按配置位置,再按热点条数
  1343. stats.sort(key=lambda x: (x["position"], -x["count"]))
  1344. else:
  1345. # 先按热点条数,再按配置位置(原逻辑)
  1346. stats.sort(key=lambda x: (-x["count"], x["position"]))
  1347. return stats, total_titles
  1348. # === 报告生成 ===
  1349. def prepare_report_data(
  1350. stats: List[Dict],
  1351. failed_ids: Optional[List] = None,
  1352. new_titles: Optional[Dict] = None,
  1353. id_to_name: Optional[Dict] = None,
  1354. mode: str = "daily",
  1355. ) -> Dict:
  1356. """准备报告数据"""
  1357. processed_new_titles = []
  1358. # 在增量模式下隐藏新增新闻区域
  1359. hide_new_section = mode == "incremental"
  1360. # 只有在非隐藏模式下才处理新增新闻部分
  1361. if not hide_new_section:
  1362. filtered_new_titles = {}
  1363. if new_titles and id_to_name:
  1364. word_groups, filter_words, global_filters = load_frequency_words()
  1365. for source_id, titles_data in new_titles.items():
  1366. filtered_titles = {}
  1367. for title, title_data in titles_data.items():
  1368. if matches_word_groups(title, word_groups, filter_words, global_filters):
  1369. filtered_titles[title] = title_data
  1370. if filtered_titles:
  1371. filtered_new_titles[source_id] = filtered_titles
  1372. if filtered_new_titles and id_to_name:
  1373. for source_id, titles_data in filtered_new_titles.items():
  1374. source_name = id_to_name.get(source_id, source_id)
  1375. source_titles = []
  1376. for title, title_data in titles_data.items():
  1377. url = title_data.get("url", "")
  1378. mobile_url = title_data.get("mobileUrl", "")
  1379. ranks = title_data.get("ranks", [])
  1380. processed_title = {
  1381. "title": title,
  1382. "source_name": source_name,
  1383. "time_display": "",
  1384. "count": 1,
  1385. "ranks": ranks,
  1386. "rank_threshold": CONFIG["RANK_THRESHOLD"],
  1387. "url": url,
  1388. "mobile_url": mobile_url,
  1389. "is_new": True,
  1390. }
  1391. source_titles.append(processed_title)
  1392. if source_titles:
  1393. processed_new_titles.append(
  1394. {
  1395. "source_id": source_id,
  1396. "source_name": source_name,
  1397. "titles": source_titles,
  1398. }
  1399. )
  1400. processed_stats = []
  1401. for stat in stats:
  1402. if stat["count"] <= 0:
  1403. continue
  1404. processed_titles = []
  1405. for title_data in stat["titles"]:
  1406. processed_title = {
  1407. "title": title_data["title"],
  1408. "source_name": title_data["source_name"],
  1409. "time_display": title_data["time_display"],
  1410. "count": title_data["count"],
  1411. "ranks": title_data["ranks"],
  1412. "rank_threshold": title_data["rank_threshold"],
  1413. "url": title_data.get("url", ""),
  1414. "mobile_url": title_data.get("mobileUrl", ""),
  1415. "is_new": title_data.get("is_new", False),
  1416. }
  1417. processed_titles.append(processed_title)
  1418. processed_stats.append(
  1419. {
  1420. "word": stat["word"],
  1421. "count": stat["count"],
  1422. "percentage": stat.get("percentage", 0),
  1423. "titles": processed_titles,
  1424. }
  1425. )
  1426. return {
  1427. "stats": processed_stats,
  1428. "new_titles": processed_new_titles,
  1429. "failed_ids": failed_ids or [],
  1430. "total_new_count": sum(
  1431. len(source["titles"]) for source in processed_new_titles
  1432. ),
  1433. }
  1434. def format_title_for_platform(
  1435. platform: str, title_data: Dict, show_source: bool = True
  1436. ) -> str:
  1437. """统一的标题格式化方法"""
  1438. rank_display = format_rank_display(
  1439. title_data["ranks"], title_data["rank_threshold"], platform
  1440. )
  1441. link_url = title_data["mobile_url"] or title_data["url"]
  1442. cleaned_title = clean_title(title_data["title"])
  1443. if platform == "feishu":
  1444. if link_url:
  1445. formatted_title = f"[{cleaned_title}]({link_url})"
  1446. else:
  1447. formatted_title = cleaned_title
  1448. title_prefix = "🆕 " if title_data.get("is_new") else ""
  1449. if show_source:
  1450. result = f"<font color='grey'>[{title_data['source_name']}]</font> {title_prefix}{formatted_title}"
  1451. else:
  1452. result = f"{title_prefix}{formatted_title}"
  1453. if rank_display:
  1454. result += f" {rank_display}"
  1455. if title_data["time_display"]:
  1456. result += f" <font color='grey'>- {title_data['time_display']}</font>"
  1457. if title_data["count"] > 1:
  1458. result += f" <font color='green'>({title_data['count']}次)</font>"
  1459. return result
  1460. elif platform == "dingtalk":
  1461. if link_url:
  1462. formatted_title = f"[{cleaned_title}]({link_url})"
  1463. else:
  1464. formatted_title = cleaned_title
  1465. title_prefix = "🆕 " if title_data.get("is_new") else ""
  1466. if show_source:
  1467. result = f"[{title_data['source_name']}] {title_prefix}{formatted_title}"
  1468. else:
  1469. result = f"{title_prefix}{formatted_title}"
  1470. if rank_display:
  1471. result += f" {rank_display}"
  1472. if title_data["time_display"]:
  1473. result += f" - {title_data['time_display']}"
  1474. if title_data["count"] > 1:
  1475. result += f" ({title_data['count']}次)"
  1476. return result
  1477. elif platform in ("wework", "bark"):
  1478. # WeWork 和 Bark 使用 markdown 格式
  1479. if link_url:
  1480. formatted_title = f"[{cleaned_title}]({link_url})"
  1481. else:
  1482. formatted_title = cleaned_title
  1483. title_prefix = "🆕 " if title_data.get("is_new") else ""
  1484. if show_source:
  1485. result = f"[{title_data['source_name']}] {title_prefix}{formatted_title}"
  1486. else:
  1487. result = f"{title_prefix}{formatted_title}"
  1488. if rank_display:
  1489. result += f" {rank_display}"
  1490. if title_data["time_display"]:
  1491. result += f" - {title_data['time_display']}"
  1492. if title_data["count"] > 1:
  1493. result += f" ({title_data['count']}次)"
  1494. return result
  1495. elif platform == "telegram":
  1496. if link_url:
  1497. formatted_title = f'<a href="{link_url}">{html_escape(cleaned_title)}</a>'
  1498. else:
  1499. formatted_title = cleaned_title
  1500. title_prefix = "🆕 " if title_data.get("is_new") else ""
  1501. if show_source:
  1502. result = f"[{title_data['source_name']}] {title_prefix}{formatted_title}"
  1503. else:
  1504. result = f"{title_prefix}{formatted_title}"
  1505. if rank_display:
  1506. result += f" {rank_display}"
  1507. if title_data["time_display"]:
  1508. result += f" <code>- {title_data['time_display']}</code>"
  1509. if title_data["count"] > 1:
  1510. result += f" <code>({title_data['count']}次)</code>"
  1511. return result
  1512. elif platform == "ntfy":
  1513. if link_url:
  1514. formatted_title = f"[{cleaned_title}]({link_url})"
  1515. else:
  1516. formatted_title = cleaned_title
  1517. title_prefix = "🆕 " if title_data.get("is_new") else ""
  1518. if show_source:
  1519. result = f"[{title_data['source_name']}] {title_prefix}{formatted_title}"
  1520. else:
  1521. result = f"{title_prefix}{formatted_title}"
  1522. if rank_display:
  1523. result += f" {rank_display}"
  1524. if title_data["time_display"]:
  1525. result += f" `- {title_data['time_display']}`"
  1526. if title_data["count"] > 1:
  1527. result += f" `({title_data['count']}次)`"
  1528. return result
  1529. elif platform == "slack":
  1530. # Slack 使用 mrkdwn 格式
  1531. if link_url:
  1532. # Slack 链接格式: <url|text>
  1533. formatted_title = f"<{link_url}|{cleaned_title}>"
  1534. else:
  1535. formatted_title = cleaned_title
  1536. title_prefix = "🆕 " if title_data.get("is_new") else ""
  1537. if show_source:
  1538. result = f"[{title_data['source_name']}] {title_prefix}{formatted_title}"
  1539. else:
  1540. result = f"{title_prefix}{formatted_title}"
  1541. # 排名(使用 * 加粗)
  1542. rank_display = format_rank_display(
  1543. title_data["ranks"], title_data["rank_threshold"], "slack"
  1544. )
  1545. if rank_display:
  1546. result += f" {rank_display}"
  1547. if title_data["time_display"]:
  1548. result += f" `- {title_data['time_display']}`"
  1549. if title_data["count"] > 1:
  1550. result += f" `({title_data['count']}次)`"
  1551. return result
  1552. elif platform == "html":
  1553. rank_display = format_rank_display(
  1554. title_data["ranks"], title_data["rank_threshold"], "html"
  1555. )
  1556. link_url = title_data["mobile_url"] or title_data["url"]
  1557. escaped_title = html_escape(cleaned_title)
  1558. escaped_source_name = html_escape(title_data["source_name"])
  1559. if link_url:
  1560. escaped_url = html_escape(link_url)
  1561. formatted_title = f'[{escaped_source_name}] <a href="{escaped_url}" target="_blank" class="news-link">{escaped_title}</a>'
  1562. else:
  1563. formatted_title = (
  1564. f'[{escaped_source_name}] <span class="no-link">{escaped_title}</span>'
  1565. )
  1566. if rank_display:
  1567. formatted_title += f" {rank_display}"
  1568. if title_data["time_display"]:
  1569. escaped_time = html_escape(title_data["time_display"])
  1570. formatted_title += f" <font color='grey'>- {escaped_time}</font>"
  1571. if title_data["count"] > 1:
  1572. formatted_title += f" <font color='green'>({title_data['count']}次)</font>"
  1573. if title_data.get("is_new"):
  1574. formatted_title = f"<div class='new-title'>🆕 {formatted_title}</div>"
  1575. return formatted_title
  1576. else:
  1577. return cleaned_title
  1578. def generate_html_report(
  1579. stats: List[Dict],
  1580. total_titles: int,
  1581. failed_ids: Optional[List] = None,
  1582. new_titles: Optional[Dict] = None,
  1583. id_to_name: Optional[Dict] = None,
  1584. mode: str = "daily",
  1585. is_daily_summary: bool = False,
  1586. update_info: Optional[Dict] = None,
  1587. ) -> str:
  1588. """生成HTML报告"""
  1589. if is_daily_summary:
  1590. if mode == "current":
  1591. filename = "当前榜单汇总.html"
  1592. elif mode == "incremental":
  1593. filename = "当日增量.html"
  1594. else:
  1595. filename = "当日汇总.html"
  1596. else:
  1597. filename = f"{format_time_filename()}.html"
  1598. file_path = get_output_path("html", filename)
  1599. report_data = prepare_report_data(stats, failed_ids, new_titles, id_to_name, mode)
  1600. html_content = render_html_content(
  1601. report_data, total_titles, is_daily_summary, mode, update_info
  1602. )
  1603. with open(file_path, "w", encoding="utf-8") as f:
  1604. f.write(html_content)
  1605. if is_daily_summary:
  1606. # 生成到根目录(供 GitHub Pages 访问)
  1607. root_index_path = Path("index.html")
  1608. with open(root_index_path, "w", encoding="utf-8") as f:
  1609. f.write(html_content)
  1610. # 同时生成到 output 目录(供 Docker Volume 挂载访问)
  1611. output_index_path = Path("output") / "index.html"
  1612. ensure_directory_exists("output")
  1613. with open(output_index_path, "w", encoding="utf-8") as f:
  1614. f.write(html_content)
  1615. return file_path
  1616. def render_html_content(
  1617. report_data: Dict,
  1618. total_titles: int,
  1619. is_daily_summary: bool = False,
  1620. mode: str = "daily",
  1621. update_info: Optional[Dict] = None,
  1622. ) -> str:
  1623. """渲染HTML内容"""
  1624. html = """
  1625. <!DOCTYPE html>
  1626. <html>
  1627. <head>
  1628. <meta charset="UTF-8">
  1629. <meta name="viewport" content="width=device-width, initial-scale=1.0">
  1630. <title>热点新闻分析</title>
  1631. <script src="https://cdnjs.cloudflare.com/ajax/libs/html2canvas/1.4.1/html2canvas.min.js" integrity="sha512-BNaRQnYJYiPSqHHDb58B0yaPfCu+Wgds8Gp/gU33kqBtgNS4tSPHuGibyoeqMV/TJlSKda6FXzoEyYGjTe+vXA==" crossorigin="anonymous" referrerpolicy="no-referrer"></script>
  1632. <style>
  1633. * { box-sizing: border-box; }
  1634. body {
  1635. font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', system-ui, sans-serif;
  1636. margin: 0;
  1637. padding: 16px;
  1638. background: #fafafa;
  1639. color: #333;
  1640. line-height: 1.5;
  1641. }
  1642. .container {
  1643. max-width: 600px;
  1644. margin: 0 auto;
  1645. background: white;
  1646. border-radius: 12px;
  1647. overflow: hidden;
  1648. box-shadow: 0 2px 16px rgba(0,0,0,0.06);
  1649. }
  1650. .header {
  1651. background: linear-gradient(135deg, #4f46e5 0%, #7c3aed 100%);
  1652. color: white;
  1653. padding: 32px 24px;
  1654. text-align: center;
  1655. position: relative;
  1656. }
  1657. .save-buttons {
  1658. position: absolute;
  1659. top: 16px;
  1660. right: 16px;
  1661. display: flex;
  1662. gap: 8px;
  1663. }
  1664. .save-btn {
  1665. background: rgba(255, 255, 255, 0.2);
  1666. border: 1px solid rgba(255, 255, 255, 0.3);
  1667. color: white;
  1668. padding: 8px 16px;
  1669. border-radius: 6px;
  1670. cursor: pointer;
  1671. font-size: 13px;
  1672. font-weight: 500;
  1673. transition: all 0.2s ease;
  1674. backdrop-filter: blur(10px);
  1675. white-space: nowrap;
  1676. }
  1677. .save-btn:hover {
  1678. background: rgba(255, 255, 255, 0.3);
  1679. border-color: rgba(255, 255, 255, 0.5);
  1680. transform: translateY(-1px);
  1681. }
  1682. .save-btn:active {
  1683. transform: translateY(0);
  1684. }
  1685. .save-btn:disabled {
  1686. opacity: 0.6;
  1687. cursor: not-allowed;
  1688. }
  1689. .header-title {
  1690. font-size: 22px;
  1691. font-weight: 700;
  1692. margin: 0 0 20px 0;
  1693. }
  1694. .header-info {
  1695. display: grid;
  1696. grid-template-columns: 1fr 1fr;
  1697. gap: 16px;
  1698. font-size: 14px;
  1699. opacity: 0.95;
  1700. }
  1701. .info-item {
  1702. text-align: center;
  1703. }
  1704. .info-label {
  1705. display: block;
  1706. font-size: 12px;
  1707. opacity: 0.8;
  1708. margin-bottom: 4px;
  1709. }
  1710. .info-value {
  1711. font-weight: 600;
  1712. font-size: 16px;
  1713. }
  1714. .content {
  1715. padding: 24px;
  1716. }
  1717. .word-group {
  1718. margin-bottom: 40px;
  1719. }
  1720. .word-group:first-child {
  1721. margin-top: 0;
  1722. }
  1723. .word-header {
  1724. display: flex;
  1725. align-items: center;
  1726. justify-content: space-between;
  1727. margin-bottom: 20px;
  1728. padding-bottom: 8px;
  1729. border-bottom: 1px solid #f0f0f0;
  1730. }
  1731. .word-info {
  1732. display: flex;
  1733. align-items: center;
  1734. gap: 12px;
  1735. }
  1736. .word-name {
  1737. font-size: 17px;
  1738. font-weight: 600;
  1739. color: #1a1a1a;
  1740. }
  1741. .word-count {
  1742. color: #666;
  1743. font-size: 13px;
  1744. font-weight: 500;
  1745. }
  1746. .word-count.hot { color: #dc2626; font-weight: 600; }
  1747. .word-count.warm { color: #ea580c; font-weight: 600; }
  1748. .word-index {
  1749. color: #999;
  1750. font-size: 12px;
  1751. }
  1752. .news-item {
  1753. margin-bottom: 20px;
  1754. padding: 16px 0;
  1755. border-bottom: 1px solid #f5f5f5;
  1756. position: relative;
  1757. display: flex;
  1758. gap: 12px;
  1759. align-items: center;
  1760. }
  1761. .news-item:last-child {
  1762. border-bottom: none;
  1763. }
  1764. .news-item.new::after {
  1765. content: "NEW";
  1766. position: absolute;
  1767. top: 12px;
  1768. right: 0;
  1769. background: #fbbf24;
  1770. color: #92400e;
  1771. font-size: 9px;
  1772. font-weight: 700;
  1773. padding: 3px 6px;
  1774. border-radius: 4px;
  1775. letter-spacing: 0.5px;
  1776. }
  1777. .news-number {
  1778. color: #999;
  1779. font-size: 13px;
  1780. font-weight: 600;
  1781. min-width: 20px;
  1782. text-align: center;
  1783. flex-shrink: 0;
  1784. background: #f8f9fa;
  1785. border-radius: 50%;
  1786. width: 24px;
  1787. height: 24px;
  1788. display: flex;
  1789. align-items: center;
  1790. justify-content: center;
  1791. align-self: flex-start;
  1792. margin-top: 8px;
  1793. }
  1794. .news-content {
  1795. flex: 1;
  1796. min-width: 0;
  1797. padding-right: 40px;
  1798. }
  1799. .news-item.new .news-content {
  1800. padding-right: 50px;
  1801. }
  1802. .news-header {
  1803. display: flex;
  1804. align-items: center;
  1805. gap: 8px;
  1806. margin-bottom: 8px;
  1807. flex-wrap: wrap;
  1808. }
  1809. .source-name {
  1810. color: #666;
  1811. font-size: 12px;
  1812. font-weight: 500;
  1813. }
  1814. .rank-num {
  1815. color: #fff;
  1816. background: #6b7280;
  1817. font-size: 10px;
  1818. font-weight: 700;
  1819. padding: 2px 6px;
  1820. border-radius: 10px;
  1821. min-width: 18px;
  1822. text-align: center;
  1823. }
  1824. .rank-num.top { background: #dc2626; }
  1825. .rank-num.high { background: #ea580c; }
  1826. .time-info {
  1827. color: #999;
  1828. font-size: 11px;
  1829. }
  1830. .count-info {
  1831. color: #059669;
  1832. font-size: 11px;
  1833. font-weight: 500;
  1834. }
  1835. .news-title {
  1836. font-size: 15px;
  1837. line-height: 1.4;
  1838. color: #1a1a1a;
  1839. margin: 0;
  1840. }
  1841. .news-link {
  1842. color: #2563eb;
  1843. text-decoration: none;
  1844. }
  1845. .news-link:hover {
  1846. text-decoration: underline;
  1847. }
  1848. .news-link:visited {
  1849. color: #7c3aed;
  1850. }
  1851. .new-section {
  1852. margin-top: 40px;
  1853. padding-top: 24px;
  1854. border-top: 2px solid #f0f0f0;
  1855. }
  1856. .new-section-title {
  1857. color: #1a1a1a;
  1858. font-size: 16px;
  1859. font-weight: 600;
  1860. margin: 0 0 20px 0;
  1861. }
  1862. .new-source-group {
  1863. margin-bottom: 24px;
  1864. }
  1865. .new-source-title {
  1866. color: #666;
  1867. font-size: 13px;
  1868. font-weight: 500;
  1869. margin: 0 0 12px 0;
  1870. padding-bottom: 6px;
  1871. border-bottom: 1px solid #f5f5f5;
  1872. }
  1873. .new-item {
  1874. display: flex;
  1875. align-items: center;
  1876. gap: 12px;
  1877. padding: 8px 0;
  1878. border-bottom: 1px solid #f9f9f9;
  1879. }
  1880. .new-item:last-child {
  1881. border-bottom: none;
  1882. }
  1883. .new-item-number {
  1884. color: #999;
  1885. font-size: 12px;
  1886. font-weight: 600;
  1887. min-width: 18px;
  1888. text-align: center;
  1889. flex-shrink: 0;
  1890. background: #f8f9fa;
  1891. border-radius: 50%;
  1892. width: 20px;
  1893. height: 20px;
  1894. display: flex;
  1895. align-items: center;
  1896. justify-content: center;
  1897. }
  1898. .new-item-rank {
  1899. color: #fff;
  1900. background: #6b7280;
  1901. font-size: 10px;
  1902. font-weight: 700;
  1903. padding: 3px 6px;
  1904. border-radius: 8px;
  1905. min-width: 20px;
  1906. text-align: center;
  1907. flex-shrink: 0;
  1908. }
  1909. .new-item-rank.top { background: #dc2626; }
  1910. .new-item-rank.high { background: #ea580c; }
  1911. .new-item-content {
  1912. flex: 1;
  1913. min-width: 0;
  1914. }
  1915. .new-item-title {
  1916. font-size: 14px;
  1917. line-height: 1.4;
  1918. color: #1a1a1a;
  1919. margin: 0;
  1920. }
  1921. .error-section {
  1922. background: #fef2f2;
  1923. border: 1px solid #fecaca;
  1924. border-radius: 8px;
  1925. padding: 16px;
  1926. margin-bottom: 24px;
  1927. }
  1928. .error-title {
  1929. color: #dc2626;
  1930. font-size: 14px;
  1931. font-weight: 600;
  1932. margin: 0 0 8px 0;
  1933. }
  1934. .error-list {
  1935. list-style: none;
  1936. padding: 0;
  1937. margin: 0;
  1938. }
  1939. .error-item {
  1940. color: #991b1b;
  1941. font-size: 13px;
  1942. padding: 2px 0;
  1943. font-family: 'SF Mono', Consolas, monospace;
  1944. }
  1945. .footer {
  1946. margin-top: 32px;
  1947. padding: 20px 24px;
  1948. background: #f8f9fa;
  1949. border-top: 1px solid #e5e7eb;
  1950. text-align: center;
  1951. }
  1952. .footer-content {
  1953. font-size: 13px;
  1954. color: #6b7280;
  1955. line-height: 1.6;
  1956. }
  1957. .footer-link {
  1958. color: #4f46e5;
  1959. text-decoration: none;
  1960. font-weight: 500;
  1961. transition: color 0.2s ease;
  1962. }
  1963. .footer-link:hover {
  1964. color: #7c3aed;
  1965. text-decoration: underline;
  1966. }
  1967. .project-name {
  1968. font-weight: 600;
  1969. color: #374151;
  1970. }
  1971. @media (max-width: 480px) {
  1972. body { padding: 12px; }
  1973. .header { padding: 24px 20px; }
  1974. .content { padding: 20px; }
  1975. .footer { padding: 16px 20px; }
  1976. .header-info { grid-template-columns: 1fr; gap: 12px; }
  1977. .news-header { gap: 6px; }
  1978. .news-content { padding-right: 45px; }
  1979. .news-item { gap: 8px; }
  1980. .new-item { gap: 8px; }
  1981. .news-number { width: 20px; height: 20px; font-size: 12px; }
  1982. .save-buttons {
  1983. position: static;
  1984. margin-bottom: 16px;
  1985. display: flex;
  1986. gap: 8px;
  1987. justify-content: center;
  1988. flex-direction: column;
  1989. width: 100%;
  1990. }
  1991. .save-btn {
  1992. width: 100%;
  1993. }
  1994. }
  1995. </style>
  1996. </head>
  1997. <body>
  1998. <div class="container">
  1999. <div class="header">
  2000. <div class="save-buttons">
  2001. <button class="save-btn" onclick="saveAsImage()">保存为图片</button>
  2002. <button class="save-btn" onclick="saveAsMultipleImages()">分段保存</button>
  2003. </div>
  2004. <div class="header-title">热点新闻分析</div>
  2005. <div class="header-info">
  2006. <div class="info-item">
  2007. <span class="info-label">报告类型</span>
  2008. <span class="info-value">"""
  2009. # 处理报告类型显示
  2010. if is_daily_summary:
  2011. if mode == "current":
  2012. html += "当前榜单"
  2013. elif mode == "incremental":
  2014. html += "增量模式"
  2015. else:
  2016. html += "当日汇总"
  2017. else:
  2018. html += "实时分析"
  2019. html += """</span>
  2020. </div>
  2021. <div class="info-item">
  2022. <span class="info-label">新闻总数</span>
  2023. <span class="info-value">"""
  2024. html += f"{total_titles} 条"
  2025. # 计算筛选后的热点新闻数量
  2026. hot_news_count = sum(len(stat["titles"]) for stat in report_data["stats"])
  2027. html += """</span>
  2028. </div>
  2029. <div class="info-item">
  2030. <span class="info-label">热点新闻</span>
  2031. <span class="info-value">"""
  2032. html += f"{hot_news_count} 条"
  2033. html += """</span>
  2034. </div>
  2035. <div class="info-item">
  2036. <span class="info-label">生成时间</span>
  2037. <span class="info-value">"""
  2038. now = get_beijing_time()
  2039. html += now.strftime("%m-%d %H:%M")
  2040. html += """</span>
  2041. </div>
  2042. </div>
  2043. </div>
  2044. <div class="content">"""
  2045. # 处理失败ID错误信息
  2046. if report_data["failed_ids"]:
  2047. html += """
  2048. <div class="error-section">
  2049. <div class="error-title">⚠️ 请求失败的平台</div>
  2050. <ul class="error-list">"""
  2051. for id_value in report_data["failed_ids"]:
  2052. html += f'<li class="error-item">{html_escape(id_value)}</li>'
  2053. html += """
  2054. </ul>
  2055. </div>"""
  2056. # 生成热点词汇统计部分的HTML
  2057. stats_html = ""
  2058. if report_data["stats"]:
  2059. total_count = len(report_data["stats"])
  2060. for i, stat in enumerate(report_data["stats"], 1):
  2061. count = stat["count"]
  2062. # 确定热度等级
  2063. if count >= 10:
  2064. count_class = "hot"
  2065. elif count >= 5:
  2066. count_class = "warm"
  2067. else:
  2068. count_class = ""
  2069. escaped_word = html_escape(stat["word"])
  2070. stats_html += f"""
  2071. <div class="word-group">
  2072. <div class="word-header">
  2073. <div class="word-info">
  2074. <div class="word-name">{escaped_word}</div>
  2075. <div class="word-count {count_class}">{count} 条</div>
  2076. </div>
  2077. <div class="word-index">{i}/{total_count}</div>
  2078. </div>"""
  2079. # 处理每个词组下的新闻标题,给每条新闻标上序号
  2080. for j, title_data in enumerate(stat["titles"], 1):
  2081. is_new = title_data.get("is_new", False)
  2082. new_class = "new" if is_new else ""
  2083. stats_html += f"""
  2084. <div class="news-item {new_class}">
  2085. <div class="news-number">{j}</div>
  2086. <div class="news-content">
  2087. <div class="news-header">
  2088. <span class="source-name">{html_escape(title_data["source_name"])}</span>"""
  2089. # 处理排名显示
  2090. ranks = title_data.get("ranks", [])
  2091. if ranks:
  2092. min_rank = min(ranks)
  2093. max_rank = max(ranks)
  2094. rank_threshold = title_data.get("rank_threshold", 10)
  2095. # 确定排名等级
  2096. if min_rank <= 3:
  2097. rank_class = "top"
  2098. elif min_rank <= rank_threshold:
  2099. rank_class = "high"
  2100. else:
  2101. rank_class = ""
  2102. if min_rank == max_rank:
  2103. rank_text = str(min_rank)
  2104. else:
  2105. rank_text = f"{min_rank}-{max_rank}"
  2106. stats_html += f'<span class="rank-num {rank_class}">{rank_text}</span>'
  2107. # 处理时间显示
  2108. time_display = title_data.get("time_display", "")
  2109. if time_display:
  2110. # 简化时间显示格式,将波浪线替换为~
  2111. simplified_time = (
  2112. time_display.replace(" ~ ", "~")
  2113. .replace("[", "")
  2114. .replace("]", "")
  2115. )
  2116. stats_html += (
  2117. f'<span class="time-info">{html_escape(simplified_time)}</span>'
  2118. )
  2119. # 处理出现次数
  2120. count_info = title_data.get("count", 1)
  2121. if count_info > 1:
  2122. stats_html += f'<span class="count-info">{count_info}次</span>'
  2123. stats_html += """
  2124. </div>
  2125. <div class="news-title">"""
  2126. # 处理标题和链接
  2127. escaped_title = html_escape(title_data["title"])
  2128. link_url = title_data.get("mobile_url") or title_data.get("url", "")
  2129. if link_url:
  2130. escaped_url = html_escape(link_url)
  2131. stats_html += f'<a href="{escaped_url}" target="_blank" class="news-link">{escaped_title}</a>'
  2132. else:
  2133. stats_html += escaped_title
  2134. stats_html += """
  2135. </div>
  2136. </div>
  2137. </div>"""
  2138. stats_html += """
  2139. </div>"""
  2140. # 生成新增新闻区域的HTML
  2141. new_titles_html = ""
  2142. if report_data["new_titles"]:
  2143. new_titles_html += f"""
  2144. <div class="new-section">
  2145. <div class="new-section-title">本次新增热点 (共 {report_data['total_new_count']} 条)</div>"""
  2146. for source_data in report_data["new_titles"]:
  2147. escaped_source = html_escape(source_data["source_name"])
  2148. titles_count = len(source_data["titles"])
  2149. new_titles_html += f"""
  2150. <div class="new-source-group">
  2151. <div class="new-source-title">{escaped_source} · {titles_count}条</div>"""
  2152. # 为新增新闻也添加序号
  2153. for idx, title_data in enumerate(source_data["titles"], 1):
  2154. ranks = title_data.get("ranks", [])
  2155. # 处理新增新闻的排名显示
  2156. rank_class = ""
  2157. if ranks:
  2158. min_rank = min(ranks)
  2159. if min_rank <= 3:
  2160. rank_class = "top"
  2161. elif min_rank <= title_data.get("rank_threshold", 10):
  2162. rank_class = "high"
  2163. if len(ranks) == 1:
  2164. rank_text = str(ranks[0])
  2165. else:
  2166. rank_text = f"{min(ranks)}-{max(ranks)}"
  2167. else:
  2168. rank_text = "?"
  2169. new_titles_html += f"""
  2170. <div class="new-item">
  2171. <div class="new-item-number">{idx}</div>
  2172. <div class="new-item-rank {rank_class}">{rank_text}</div>
  2173. <div class="new-item-content">
  2174. <div class="new-item-title">"""
  2175. # 处理新增新闻的链接
  2176. escaped_title = html_escape(title_data["title"])
  2177. link_url = title_data.get("mobile_url") or title_data.get("url", "")
  2178. if link_url:
  2179. escaped_url = html_escape(link_url)
  2180. new_titles_html += f'<a href="{escaped_url}" target="_blank" class="news-link">{escaped_title}</a>'
  2181. else:
  2182. new_titles_html += escaped_title
  2183. new_titles_html += """
  2184. </div>
  2185. </div>
  2186. </div>"""
  2187. new_titles_html += """
  2188. </div>"""
  2189. new_titles_html += """
  2190. </div>"""
  2191. # 根据配置决定内容顺序
  2192. if CONFIG.get("REVERSE_CONTENT_ORDER", False):
  2193. # 新增热点在前,热点词汇统计在后
  2194. html += new_titles_html + stats_html
  2195. else:
  2196. # 默认:热点词汇统计在前,新增热点在后
  2197. html += stats_html + new_titles_html
  2198. html += """
  2199. </div>
  2200. <div class="footer">
  2201. <div class="footer-content">
  2202. 由 <span class="project-name">TrendRadar</span> 生成 ·
  2203. <a href="https://github.com/sansan0/TrendRadar" target="_blank" class="footer-link">
  2204. GitHub 开源项目
  2205. </a>"""
  2206. if update_info:
  2207. html += f"""
  2208. <br>
  2209. <span style="color: #ea580c; font-weight: 500;">
  2210. 发现新版本 {update_info['remote_version']},当前版本 {update_info['current_version']}
  2211. </span>"""
  2212. html += """
  2213. </div>
  2214. </div>
  2215. </div>
  2216. <script>
  2217. async function saveAsImage() {
  2218. const button = event.target;
  2219. const originalText = button.textContent;
  2220. try {
  2221. button.textContent = '生成中...';
  2222. button.disabled = true;
  2223. window.scrollTo(0, 0);
  2224. // 等待页面稳定
  2225. await new Promise(resolve => setTimeout(resolve, 200));
  2226. // 截图前隐藏按钮
  2227. const buttons = document.querySelector('.save-buttons');
  2228. buttons.style.visibility = 'hidden';
  2229. // 再次等待确保按钮完全隐藏
  2230. await new Promise(resolve => setTimeout(resolve, 100));
  2231. const container = document.querySelector('.container');
  2232. const canvas = await html2canvas(container, {
  2233. backgroundColor: '#ffffff',
  2234. scale: 1.5,
  2235. useCORS: true,
  2236. allowTaint: false,
  2237. imageTimeout: 10000,
  2238. removeContainer: false,
  2239. foreignObjectRendering: false,
  2240. logging: false,
  2241. width: container.offsetWidth,
  2242. height: container.offsetHeight,
  2243. x: 0,
  2244. y: 0,
  2245. scrollX: 0,
  2246. scrollY: 0,
  2247. windowWidth: window.innerWidth,
  2248. windowHeight: window.innerHeight
  2249. });
  2250. buttons.style.visibility = 'visible';
  2251. const link = document.createElement('a');
  2252. const now = new Date();
  2253. const filename = `TrendRadar_热点新闻分析_${now.getFullYear()}${String(now.getMonth() + 1).padStart(2, '0')}${String(now.getDate()).padStart(2, '0')}_${String(now.getHours()).padStart(2, '0')}${String(now.getMinutes()).padStart(2, '0')}.png`;
  2254. link.download = filename;
  2255. link.href = canvas.toDataURL('image/png', 1.0);
  2256. // 触发下载
  2257. document.body.appendChild(link);
  2258. link.click();
  2259. document.body.removeChild(link);
  2260. button.textContent = '保存成功!';
  2261. setTimeout(() => {
  2262. button.textContent = originalText;
  2263. button.disabled = false;
  2264. }, 2000);
  2265. } catch (error) {
  2266. const buttons = document.querySelector('.save-buttons');
  2267. buttons.style.visibility = 'visible';
  2268. button.textContent = '保存失败';
  2269. setTimeout(() => {
  2270. button.textContent = originalText;
  2271. button.disabled = false;
  2272. }, 2000);
  2273. }
  2274. }
  2275. async function saveAsMultipleImages() {
  2276. const button = event.target;
  2277. const originalText = button.textContent;
  2278. const container = document.querySelector('.container');
  2279. const scale = 1.5;
  2280. const maxHeight = 5000 / scale;
  2281. try {
  2282. button.textContent = '分析中...';
  2283. button.disabled = true;
  2284. // 获取所有可能的分割元素
  2285. const newsItems = Array.from(container.querySelectorAll('.news-item'));
  2286. const wordGroups = Array.from(container.querySelectorAll('.word-group'));
  2287. const newSection = container.querySelector('.new-section');
  2288. const errorSection = container.querySelector('.error-section');
  2289. const header = container.querySelector('.header');
  2290. const footer = container.querySelector('.footer');
  2291. // 计算元素位置和高度
  2292. const containerRect = container.getBoundingClientRect();
  2293. const elements = [];
  2294. // 添加header作为必须包含的元素
  2295. elements.push({
  2296. type: 'header',
  2297. element: header,
  2298. top: 0,
  2299. bottom: header.offsetHeight,
  2300. height: header.offsetHeight
  2301. });
  2302. // 添加错误信息(如果存在)
  2303. if (errorSection) {
  2304. const rect = errorSection.getBoundingClientRect();
  2305. elements.push({
  2306. type: 'error',
  2307. element: errorSection,
  2308. top: rect.top - containerRect.top,
  2309. bottom: rect.bottom - containerRect.top,
  2310. height: rect.height
  2311. });
  2312. }
  2313. // 按word-group分组处理news-item
  2314. wordGroups.forEach(group => {
  2315. const groupRect = group.getBoundingClientRect();
  2316. const groupNewsItems = group.querySelectorAll('.news-item');
  2317. // 添加word-group的header部分
  2318. const wordHeader = group.querySelector('.word-header');
  2319. if (wordHeader) {
  2320. const headerRect = wordHeader.getBoundingClientRect();
  2321. elements.push({
  2322. type: 'word-header',
  2323. element: wordHeader,
  2324. parent: group,
  2325. top: groupRect.top - containerRect.top,
  2326. bottom: headerRect.bottom - containerRect.top,
  2327. height: headerRect.height
  2328. });
  2329. }
  2330. // 添加每个news-item
  2331. groupNewsItems.forEach(item => {
  2332. const rect = item.getBoundingClientRect();
  2333. elements.push({
  2334. type: 'news-item',
  2335. element: item,
  2336. parent: group,
  2337. top: rect.top - containerRect.top,
  2338. bottom: rect.bottom - containerRect.top,
  2339. height: rect.height
  2340. });
  2341. });
  2342. });
  2343. // 添加新增新闻部分
  2344. if (newSection) {
  2345. const rect = newSection.getBoundingClientRect();
  2346. elements.push({
  2347. type: 'new-section',
  2348. element: newSection,
  2349. top: rect.top - containerRect.top,
  2350. bottom: rect.bottom - containerRect.top,
  2351. height: rect.height
  2352. });
  2353. }
  2354. // 添加footer
  2355. const footerRect = footer.getBoundingClientRect();
  2356. elements.push({
  2357. type: 'footer',
  2358. element: footer,
  2359. top: footerRect.top - containerRect.top,
  2360. bottom: footerRect.bottom - containerRect.top,
  2361. height: footer.offsetHeight
  2362. });
  2363. // 计算分割点
  2364. const segments = [];
  2365. let currentSegment = { start: 0, end: 0, height: 0, includeHeader: true };
  2366. let headerHeight = header.offsetHeight;
  2367. currentSegment.height = headerHeight;
  2368. for (let i = 1; i < elements.length; i++) {
  2369. const element = elements[i];
  2370. const potentialHeight = element.bottom - currentSegment.start;
  2371. // 检查是否需要创建新分段
  2372. if (potentialHeight > maxHeight && currentSegment.height > headerHeight) {
  2373. // 在前一个元素结束处分割
  2374. currentSegment.end = elements[i - 1].bottom;
  2375. segments.push(currentSegment);
  2376. // 开始新分段
  2377. currentSegment = {
  2378. start: currentSegment.end,
  2379. end: 0,
  2380. height: element.bottom - currentSegment.end,
  2381. includeHeader: false
  2382. };
  2383. } else {
  2384. currentSegment.height = potentialHeight;
  2385. currentSegment.end = element.bottom;
  2386. }
  2387. }
  2388. // 添加最后一个分段
  2389. if (currentSegment.height > 0) {
  2390. currentSegment.end = container.offsetHeight;
  2391. segments.push(currentSegment);
  2392. }
  2393. button.textContent = `生成中 (0/${segments.length})...`;
  2394. // 隐藏保存按钮
  2395. const buttons = document.querySelector('.save-buttons');
  2396. buttons.style.visibility = 'hidden';
  2397. // 为每个分段生成图片
  2398. const images = [];
  2399. for (let i = 0; i < segments.length; i++) {
  2400. const segment = segments[i];
  2401. button.textContent = `生成中 (${i + 1}/${segments.length})...`;
  2402. // 创建临时容器用于截图
  2403. const tempContainer = document.createElement('div');
  2404. tempContainer.style.cssText = `
  2405. position: absolute;
  2406. left: -9999px;
  2407. top: 0;
  2408. width: ${container.offsetWidth}px;
  2409. background: white;
  2410. `;
  2411. tempContainer.className = 'container';
  2412. // 克隆容器内容
  2413. const clonedContainer = container.cloneNode(true);
  2414. // 移除克隆内容中的保存按钮
  2415. const clonedButtons = clonedContainer.querySelector('.save-buttons');
  2416. if (clonedButtons) {
  2417. clonedButtons.style.display = 'none';
  2418. }
  2419. tempContainer.appendChild(clonedContainer);
  2420. document.body.appendChild(tempContainer);
  2421. // 等待DOM更新
  2422. await new Promise(resolve => setTimeout(resolve, 100));
  2423. // 使用html2canvas截取特定区域
  2424. const canvas = await html2canvas(clonedContainer, {
  2425. backgroundColor: '#ffffff',
  2426. scale: scale,
  2427. useCORS: true,
  2428. allowTaint: false,
  2429. imageTimeout: 10000,
  2430. logging: false,
  2431. width: container.offsetWidth,
  2432. height: segment.end - segment.start,
  2433. x: 0,
  2434. y: segment.start,
  2435. windowWidth: window.innerWidth,
  2436. windowHeight: window.innerHeight
  2437. });
  2438. images.push(canvas.toDataURL('image/png', 1.0));
  2439. // 清理临时容器
  2440. document.body.removeChild(tempContainer);
  2441. }
  2442. // 恢复按钮显示
  2443. buttons.style.visibility = 'visible';
  2444. // 下载所有图片
  2445. const now = new Date();
  2446. const baseFilename = `TrendRadar_热点新闻分析_${now.getFullYear()}${String(now.getMonth() + 1).padStart(2, '0')}${String(now.getDate()).padStart(2, '0')}_${String(now.getHours()).padStart(2, '0')}${String(now.getMinutes()).padStart(2, '0')}`;
  2447. for (let i = 0; i < images.length; i++) {
  2448. const link = document.createElement('a');
  2449. link.download = `${baseFilename}_part${i + 1}.png`;
  2450. link.href = images[i];
  2451. document.body.appendChild(link);
  2452. link.click();
  2453. document.body.removeChild(link);
  2454. // 延迟一下避免浏览器阻止多个下载
  2455. await new Promise(resolve => setTimeout(resolve, 100));
  2456. }
  2457. button.textContent = `已保存 ${segments.length} 张图片!`;
  2458. setTimeout(() => {
  2459. button.textContent = originalText;
  2460. button.disabled = false;
  2461. }, 2000);
  2462. } catch (error) {
  2463. console.error('分段保存失败:', error);
  2464. const buttons = document.querySelector('.save-buttons');
  2465. buttons.style.visibility = 'visible';
  2466. button.textContent = '保存失败';
  2467. setTimeout(() => {
  2468. button.textContent = originalText;
  2469. button.disabled = false;
  2470. }, 2000);
  2471. }
  2472. }
  2473. document.addEventListener('DOMContentLoaded', function() {
  2474. window.scrollTo(0, 0);
  2475. });
  2476. </script>
  2477. </body>
  2478. </html>
  2479. """
  2480. return html
  2481. def render_feishu_content(
  2482. report_data: Dict, update_info: Optional[Dict] = None, mode: str = "daily"
  2483. ) -> str:
  2484. """渲染飞书内容"""
  2485. # 生成热点词汇统计部分
  2486. stats_content = ""
  2487. if report_data["stats"]:
  2488. stats_content += f"📊 **热点词汇统计**\n\n"
  2489. total_count = len(report_data["stats"])
  2490. for i, stat in enumerate(report_data["stats"]):
  2491. word = stat["word"]
  2492. count = stat["count"]
  2493. sequence_display = f"<font color='grey'>[{i + 1}/{total_count}]</font>"
  2494. if count >= 10:
  2495. stats_content += f"🔥 {sequence_display} **{word}** : <font color='red'>{count}</font> 条\n\n"
  2496. elif count >= 5:
  2497. stats_content += f"📈 {sequence_display} **{word}** : <font color='orange'>{count}</font> 条\n\n"
  2498. else:
  2499. stats_content += f"📌 {sequence_display} **{word}** : {count} 条\n\n"
  2500. for j, title_data in enumerate(stat["titles"], 1):
  2501. formatted_title = format_title_for_platform(
  2502. "feishu", title_data, show_source=True
  2503. )
  2504. stats_content += f" {j}. {formatted_title}\n"
  2505. if j < len(stat["titles"]):
  2506. stats_content += "\n"
  2507. if i < len(report_data["stats"]) - 1:
  2508. stats_content += f"\n{CONFIG['FEISHU_MESSAGE_SEPARATOR']}\n\n"
  2509. # 生成新增新闻部分
  2510. new_titles_content = ""
  2511. if report_data["new_titles"]:
  2512. new_titles_content += (
  2513. f"🆕 **本次新增热点新闻** (共 {report_data['total_new_count']} 条)\n\n"
  2514. )
  2515. for source_data in report_data["new_titles"]:
  2516. new_titles_content += (
  2517. f"**{source_data['source_name']}** ({len(source_data['titles'])} 条):\n"
  2518. )
  2519. for j, title_data in enumerate(source_data["titles"], 1):
  2520. title_data_copy = title_data.copy()
  2521. title_data_copy["is_new"] = False
  2522. formatted_title = format_title_for_platform(
  2523. "feishu", title_data_copy, show_source=False
  2524. )
  2525. new_titles_content += f" {j}. {formatted_title}\n"
  2526. new_titles_content += "\n"
  2527. # 根据配置决定内容顺序
  2528. text_content = ""
  2529. if CONFIG.get("REVERSE_CONTENT_ORDER", False):
  2530. # 新增热点在前,热点词汇统计在后
  2531. if new_titles_content:
  2532. text_content += new_titles_content
  2533. if stats_content:
  2534. text_content += f"\n{CONFIG['FEISHU_MESSAGE_SEPARATOR']}\n\n"
  2535. if stats_content:
  2536. text_content += stats_content
  2537. else:
  2538. # 默认:热点词汇统计在前,新增热点在后
  2539. if stats_content:
  2540. text_content += stats_content
  2541. if new_titles_content:
  2542. text_content += f"\n{CONFIG['FEISHU_MESSAGE_SEPARATOR']}\n\n"
  2543. if new_titles_content:
  2544. text_content += new_titles_content
  2545. if not text_content:
  2546. if mode == "incremental":
  2547. mode_text = "增量模式下暂无新增匹配的热点词汇"
  2548. elif mode == "current":
  2549. mode_text = "当前榜单模式下暂无匹配的热点词汇"
  2550. else:
  2551. mode_text = "暂无匹配的热点词汇"
  2552. text_content = f"📭 {mode_text}\n\n"
  2553. if report_data["failed_ids"]:
  2554. if text_content and "暂无匹配" not in text_content:
  2555. text_content += f"\n{CONFIG['FEISHU_MESSAGE_SEPARATOR']}\n\n"
  2556. text_content += "⚠️ **数据获取失败的平台:**\n\n"
  2557. for i, id_value in enumerate(report_data["failed_ids"], 1):
  2558. text_content += f" • <font color='red'>{id_value}</font>\n"
  2559. now = get_beijing_time()
  2560. text_content += (
  2561. f"\n\n<font color='grey'>更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}</font>"
  2562. )
  2563. if update_info:
  2564. text_content += f"\n<font color='grey'>TrendRadar 发现新版本 {update_info['remote_version']},当前 {update_info['current_version']}</font>"
  2565. return text_content
  2566. def render_dingtalk_content(
  2567. report_data: Dict, update_info: Optional[Dict] = None, mode: str = "daily"
  2568. ) -> str:
  2569. """渲染钉钉内容"""
  2570. total_titles = sum(
  2571. len(stat["titles"]) for stat in report_data["stats"] if stat["count"] > 0
  2572. )
  2573. now = get_beijing_time()
  2574. # 头部信息
  2575. header_content = f"**总新闻数:** {total_titles}\n\n"
  2576. header_content += f"**时间:** {now.strftime('%Y-%m-%d %H:%M:%S')}\n\n"
  2577. header_content += f"**类型:** 热点分析报告\n\n"
  2578. header_content += "---\n\n"
  2579. # 生成热点词汇统计部分
  2580. stats_content = ""
  2581. if report_data["stats"]:
  2582. stats_content += f"📊 **热点词汇统计**\n\n"
  2583. total_count = len(report_data["stats"])
  2584. for i, stat in enumerate(report_data["stats"]):
  2585. word = stat["word"]
  2586. count = stat["count"]
  2587. sequence_display = f"[{i + 1}/{total_count}]"
  2588. if count >= 10:
  2589. stats_content += f"🔥 {sequence_display} **{word}** : **{count}** 条\n\n"
  2590. elif count >= 5:
  2591. stats_content += f"📈 {sequence_display} **{word}** : **{count}** 条\n\n"
  2592. else:
  2593. stats_content += f"📌 {sequence_display} **{word}** : {count} 条\n\n"
  2594. for j, title_data in enumerate(stat["titles"], 1):
  2595. formatted_title = format_title_for_platform(
  2596. "dingtalk", title_data, show_source=True
  2597. )
  2598. stats_content += f" {j}. {formatted_title}\n"
  2599. if j < len(stat["titles"]):
  2600. stats_content += "\n"
  2601. if i < len(report_data["stats"]) - 1:
  2602. stats_content += f"\n---\n\n"
  2603. # 生成新增新闻部分
  2604. new_titles_content = ""
  2605. if report_data["new_titles"]:
  2606. new_titles_content += (
  2607. f"🆕 **本次新增热点新闻** (共 {report_data['total_new_count']} 条)\n\n"
  2608. )
  2609. for source_data in report_data["new_titles"]:
  2610. new_titles_content += f"**{source_data['source_name']}** ({len(source_data['titles'])} 条):\n\n"
  2611. for j, title_data in enumerate(source_data["titles"], 1):
  2612. title_data_copy = title_data.copy()
  2613. title_data_copy["is_new"] = False
  2614. formatted_title = format_title_for_platform(
  2615. "dingtalk", title_data_copy, show_source=False
  2616. )
  2617. new_titles_content += f" {j}. {formatted_title}\n"
  2618. new_titles_content += "\n"
  2619. # 根据配置决定内容顺序
  2620. text_content = header_content
  2621. if CONFIG.get("REVERSE_CONTENT_ORDER", False):
  2622. # 新增热点在前,热点词汇统计在后
  2623. if new_titles_content:
  2624. text_content += new_titles_content
  2625. if stats_content:
  2626. text_content += f"\n---\n\n"
  2627. if stats_content:
  2628. text_content += stats_content
  2629. else:
  2630. # 默认:热点词汇统计在前,新增热点在后
  2631. if stats_content:
  2632. text_content += stats_content
  2633. if new_titles_content:
  2634. text_content += f"\n---\n\n"
  2635. if new_titles_content:
  2636. text_content += new_titles_content
  2637. if not stats_content and not new_titles_content:
  2638. if mode == "incremental":
  2639. mode_text = "增量模式下暂无新增匹配的热点词汇"
  2640. elif mode == "current":
  2641. mode_text = "当前榜单模式下暂无匹配的热点词汇"
  2642. else:
  2643. mode_text = "暂无匹配的热点词汇"
  2644. text_content += f"📭 {mode_text}\n\n"
  2645. if report_data["failed_ids"]:
  2646. if "暂无匹配" not in text_content:
  2647. text_content += f"\n---\n\n"
  2648. text_content += "⚠️ **数据获取失败的平台:**\n\n"
  2649. for i, id_value in enumerate(report_data["failed_ids"], 1):
  2650. text_content += f" • **{id_value}**\n"
  2651. text_content += f"\n\n> 更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}"
  2652. if update_info:
  2653. text_content += f"\n> TrendRadar 发现新版本 **{update_info['remote_version']}**,当前 **{update_info['current_version']}**"
  2654. return text_content
  2655. def _get_batch_header(format_type: str, batch_num: int, total_batches: int) -> str:
  2656. """根据 format_type 生成对应格式的批次头部"""
  2657. if format_type == "telegram":
  2658. return f"<b>[第 {batch_num}/{total_batches} 批次]</b>\n\n"
  2659. elif format_type == "slack":
  2660. return f"*[第 {batch_num}/{total_batches} 批次]*\n\n"
  2661. elif format_type in ("wework_text", "bark"):
  2662. # 企业微信文本模式和 Bark 使用纯文本格式
  2663. return f"[第 {batch_num}/{total_batches} 批次]\n\n"
  2664. else:
  2665. # 飞书、钉钉、ntfy、企业微信 markdown 模式
  2666. return f"**[第 {batch_num}/{total_batches} 批次]**\n\n"
  2667. def _get_max_batch_header_size(format_type: str) -> int:
  2668. """估算批次头部的最大字节数(假设最多 99 批次)
  2669. 用于在分批时预留空间,避免事后截断破坏内容完整性。
  2670. """
  2671. # 生成最坏情况的头部(99/99 批次)
  2672. max_header = _get_batch_header(format_type, 99, 99)
  2673. return len(max_header.encode("utf-8"))
  2674. def _truncate_to_bytes(text: str, max_bytes: int) -> str:
  2675. """安全截断字符串到指定字节数,避免截断多字节字符"""
  2676. text_bytes = text.encode("utf-8")
  2677. if len(text_bytes) <= max_bytes:
  2678. return text
  2679. # 截断到指定字节数
  2680. truncated = text_bytes[:max_bytes]
  2681. # 处理可能的不完整 UTF-8 字符
  2682. for i in range(min(4, len(truncated))):
  2683. try:
  2684. return truncated[: len(truncated) - i].decode("utf-8")
  2685. except UnicodeDecodeError:
  2686. continue
  2687. # 极端情况:返回空字符串
  2688. return ""
  2689. def add_batch_headers(
  2690. batches: List[str], format_type: str, max_bytes: int
  2691. ) -> List[str]:
  2692. """为批次添加头部,动态计算确保总大小不超过限制
  2693. Args:
  2694. batches: 原始批次列表
  2695. format_type: 推送类型(bark, telegram, feishu 等)
  2696. max_bytes: 该推送类型的最大字节限制
  2697. Returns:
  2698. 添加头部后的批次列表
  2699. """
  2700. if len(batches) <= 1:
  2701. return batches
  2702. total = len(batches)
  2703. result = []
  2704. for i, content in enumerate(batches, 1):
  2705. # 生成批次头部
  2706. header = _get_batch_header(format_type, i, total)
  2707. header_size = len(header.encode("utf-8"))
  2708. # 动态计算允许的最大内容大小
  2709. max_content_size = max_bytes - header_size
  2710. content_size = len(content.encode("utf-8"))
  2711. # 如果超出,截断到安全大小
  2712. if content_size > max_content_size:
  2713. print(
  2714. f"警告:{format_type} 第 {i}/{total} 批次内容({content_size}字节) + 头部({header_size}字节) 超出限制({max_bytes}字节),截断到 {max_content_size} 字节"
  2715. )
  2716. content = _truncate_to_bytes(content, max_content_size)
  2717. result.append(header + content)
  2718. return result
  2719. def split_content_into_batches(
  2720. report_data: Dict,
  2721. format_type: str,
  2722. update_info: Optional[Dict] = None,
  2723. max_bytes: int = None,
  2724. mode: str = "daily",
  2725. ) -> List[str]:
  2726. """分批处理消息内容,确保词组标题+至少第一条新闻的完整性"""
  2727. if max_bytes is None:
  2728. if format_type == "dingtalk":
  2729. max_bytes = CONFIG.get("DINGTALK_BATCH_SIZE", 20000)
  2730. elif format_type == "feishu":
  2731. max_bytes = CONFIG.get("FEISHU_BATCH_SIZE", 29000)
  2732. elif format_type == "ntfy":
  2733. max_bytes = 3800
  2734. else:
  2735. max_bytes = CONFIG.get("MESSAGE_BATCH_SIZE", 4000)
  2736. batches = []
  2737. total_titles = sum(
  2738. len(stat["titles"]) for stat in report_data["stats"] if stat["count"] > 0
  2739. )
  2740. now = get_beijing_time()
  2741. base_header = ""
  2742. if format_type in ("wework", "bark"):
  2743. base_header = f"**总新闻数:** {total_titles}\n\n\n\n"
  2744. elif format_type == "telegram":
  2745. base_header = f"总新闻数: {total_titles}\n\n"
  2746. elif format_type == "ntfy":
  2747. base_header = f"**总新闻数:** {total_titles}\n\n"
  2748. elif format_type == "feishu":
  2749. base_header = ""
  2750. elif format_type == "dingtalk":
  2751. base_header = f"**总新闻数:** {total_titles}\n\n"
  2752. base_header += f"**时间:** {now.strftime('%Y-%m-%d %H:%M:%S')}\n\n"
  2753. base_header += f"**类型:** 热点分析报告\n\n"
  2754. base_header += "---\n\n"
  2755. elif format_type == "slack":
  2756. base_header = f"*总新闻数:* {total_titles}\n\n"
  2757. base_footer = ""
  2758. if format_type in ("wework", "bark"):
  2759. base_footer = f"\n\n\n> 更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}"
  2760. if update_info:
  2761. base_footer += f"\n> TrendRadar 发现新版本 **{update_info['remote_version']}**,当前 **{update_info['current_version']}**"
  2762. elif format_type == "telegram":
  2763. base_footer = f"\n\n更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}"
  2764. if update_info:
  2765. base_footer += f"\nTrendRadar 发现新版本 {update_info['remote_version']},当前 {update_info['current_version']}"
  2766. elif format_type == "ntfy":
  2767. base_footer = f"\n\n> 更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}"
  2768. if update_info:
  2769. base_footer += f"\n> TrendRadar 发现新版本 **{update_info['remote_version']}**,当前 **{update_info['current_version']}**"
  2770. elif format_type == "feishu":
  2771. base_footer = f"\n\n<font color='grey'>更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}</font>"
  2772. if update_info:
  2773. base_footer += f"\n<font color='grey'>TrendRadar 发现新版本 {update_info['remote_version']},当前 {update_info['current_version']}</font>"
  2774. elif format_type == "dingtalk":
  2775. base_footer = f"\n\n> 更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}"
  2776. if update_info:
  2777. base_footer += f"\n> TrendRadar 发现新版本 **{update_info['remote_version']}**,当前 **{update_info['current_version']}**"
  2778. elif format_type == "slack":
  2779. base_footer = f"\n\n_更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}_"
  2780. if update_info:
  2781. base_footer += f"\n_TrendRadar 发现新版本 *{update_info['remote_version']}*,当前 *{update_info['current_version']}_"
  2782. stats_header = ""
  2783. if report_data["stats"]:
  2784. if format_type in ("wework", "bark"):
  2785. stats_header = f"📊 **热点词汇统计**\n\n"
  2786. elif format_type == "telegram":
  2787. stats_header = f"📊 热点词汇统计\n\n"
  2788. elif format_type == "ntfy":
  2789. stats_header = f"📊 **热点词汇统计**\n\n"
  2790. elif format_type == "feishu":
  2791. stats_header = f"📊 **热点词汇统计**\n\n"
  2792. elif format_type == "dingtalk":
  2793. stats_header = f"📊 **热点词汇统计**\n\n"
  2794. elif format_type == "slack":
  2795. stats_header = f"📊 *热点词汇统计*\n\n"
  2796. current_batch = base_header
  2797. current_batch_has_content = False
  2798. if (
  2799. not report_data["stats"]
  2800. and not report_data["new_titles"]
  2801. and not report_data["failed_ids"]
  2802. ):
  2803. if mode == "incremental":
  2804. mode_text = "增量模式下暂无新增匹配的热点词汇"
  2805. elif mode == "current":
  2806. mode_text = "当前榜单模式下暂无匹配的热点词汇"
  2807. else:
  2808. mode_text = "暂无匹配的热点词汇"
  2809. simple_content = f"📭 {mode_text}\n\n"
  2810. final_content = base_header + simple_content + base_footer
  2811. batches.append(final_content)
  2812. return batches
  2813. # 定义处理热点词汇统计的函数
  2814. def process_stats_section(current_batch, current_batch_has_content, batches):
  2815. """处理热点词汇统计"""
  2816. if not report_data["stats"]:
  2817. return current_batch, current_batch_has_content, batches
  2818. total_count = len(report_data["stats"])
  2819. # 添加统计标题
  2820. test_content = current_batch + stats_header
  2821. if (
  2822. len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8"))
  2823. < max_bytes
  2824. ):
  2825. current_batch = test_content
  2826. current_batch_has_content = True
  2827. else:
  2828. if current_batch_has_content:
  2829. batches.append(current_batch + base_footer)
  2830. current_batch = base_header + stats_header
  2831. current_batch_has_content = True
  2832. # 逐个处理词组(确保词组标题+第一条新闻的原子性)
  2833. for i, stat in enumerate(report_data["stats"]):
  2834. word = stat["word"]
  2835. count = stat["count"]
  2836. sequence_display = f"[{i + 1}/{total_count}]"
  2837. # 构建词组标题
  2838. word_header = ""
  2839. if format_type in ("wework", "bark"):
  2840. if count >= 10:
  2841. word_header = (
  2842. f"🔥 {sequence_display} **{word}** : **{count}** 条\n\n"
  2843. )
  2844. elif count >= 5:
  2845. word_header = (
  2846. f"📈 {sequence_display} **{word}** : **{count}** 条\n\n"
  2847. )
  2848. else:
  2849. word_header = f"📌 {sequence_display} **{word}** : {count} 条\n\n"
  2850. elif format_type == "telegram":
  2851. if count >= 10:
  2852. word_header = f"🔥 {sequence_display} {word} : {count} 条\n\n"
  2853. elif count >= 5:
  2854. word_header = f"📈 {sequence_display} {word} : {count} 条\n\n"
  2855. else:
  2856. word_header = f"📌 {sequence_display} {word} : {count} 条\n\n"
  2857. elif format_type == "ntfy":
  2858. if count >= 10:
  2859. word_header = (
  2860. f"🔥 {sequence_display} **{word}** : **{count}** 条\n\n"
  2861. )
  2862. elif count >= 5:
  2863. word_header = (
  2864. f"📈 {sequence_display} **{word}** : **{count}** 条\n\n"
  2865. )
  2866. else:
  2867. word_header = f"📌 {sequence_display} **{word}** : {count} 条\n\n"
  2868. elif format_type == "feishu":
  2869. if count >= 10:
  2870. word_header = f"🔥 <font color='grey'>{sequence_display}</font> **{word}** : <font color='red'>{count}</font> 条\n\n"
  2871. elif count >= 5:
  2872. word_header = f"📈 <font color='grey'>{sequence_display}</font> **{word}** : <font color='orange'>{count}</font> 条\n\n"
  2873. else:
  2874. word_header = f"📌 <font color='grey'>{sequence_display}</font> **{word}** : {count} 条\n\n"
  2875. elif format_type == "dingtalk":
  2876. if count >= 10:
  2877. word_header = (
  2878. f"🔥 {sequence_display} **{word}** : **{count}** 条\n\n"
  2879. )
  2880. elif count >= 5:
  2881. word_header = (
  2882. f"📈 {sequence_display} **{word}** : **{count}** 条\n\n"
  2883. )
  2884. else:
  2885. word_header = f"📌 {sequence_display} **{word}** : {count} 条\n\n"
  2886. elif format_type == "slack":
  2887. if count >= 10:
  2888. word_header = (
  2889. f"🔥 {sequence_display} *{word}* : *{count}* 条\n\n"
  2890. )
  2891. elif count >= 5:
  2892. word_header = (
  2893. f"📈 {sequence_display} *{word}* : *{count}* 条\n\n"
  2894. )
  2895. else:
  2896. word_header = f"📌 {sequence_display} *{word}* : {count} 条\n\n"
  2897. # 构建第一条新闻
  2898. first_news_line = ""
  2899. if stat["titles"]:
  2900. first_title_data = stat["titles"][0]
  2901. if format_type in ("wework", "bark"):
  2902. formatted_title = format_title_for_platform(
  2903. "wework", first_title_data, show_source=True
  2904. )
  2905. elif format_type == "telegram":
  2906. formatted_title = format_title_for_platform(
  2907. "telegram", first_title_data, show_source=True
  2908. )
  2909. elif format_type == "ntfy":
  2910. formatted_title = format_title_for_platform(
  2911. "ntfy", first_title_data, show_source=True
  2912. )
  2913. elif format_type == "feishu":
  2914. formatted_title = format_title_for_platform(
  2915. "feishu", first_title_data, show_source=True
  2916. )
  2917. elif format_type == "dingtalk":
  2918. formatted_title = format_title_for_platform(
  2919. "dingtalk", first_title_data, show_source=True
  2920. )
  2921. elif format_type == "slack":
  2922. formatted_title = format_title_for_platform(
  2923. "slack", first_title_data, show_source=True
  2924. )
  2925. else:
  2926. formatted_title = f"{first_title_data['title']}"
  2927. first_news_line = f" 1. {formatted_title}\n"
  2928. if len(stat["titles"]) > 1:
  2929. first_news_line += "\n"
  2930. # 原子性检查:词组标题+第一条新闻必须一起处理
  2931. word_with_first_news = word_header + first_news_line
  2932. test_content = current_batch + word_with_first_news
  2933. if (
  2934. len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8"))
  2935. >= max_bytes
  2936. ):
  2937. # 当前批次容纳不下,开启新批次
  2938. if current_batch_has_content:
  2939. batches.append(current_batch + base_footer)
  2940. current_batch = base_header + stats_header + word_with_first_news
  2941. current_batch_has_content = True
  2942. start_index = 1
  2943. else:
  2944. current_batch = test_content
  2945. current_batch_has_content = True
  2946. start_index = 1
  2947. # 处理剩余新闻条目
  2948. for j in range(start_index, len(stat["titles"])):
  2949. title_data = stat["titles"][j]
  2950. if format_type in ("wework", "bark"):
  2951. formatted_title = format_title_for_platform(
  2952. "wework", title_data, show_source=True
  2953. )
  2954. elif format_type == "telegram":
  2955. formatted_title = format_title_for_platform(
  2956. "telegram", title_data, show_source=True
  2957. )
  2958. elif format_type == "ntfy":
  2959. formatted_title = format_title_for_platform(
  2960. "ntfy", title_data, show_source=True
  2961. )
  2962. elif format_type == "feishu":
  2963. formatted_title = format_title_for_platform(
  2964. "feishu", title_data, show_source=True
  2965. )
  2966. elif format_type == "dingtalk":
  2967. formatted_title = format_title_for_platform(
  2968. "dingtalk", title_data, show_source=True
  2969. )
  2970. elif format_type == "slack":
  2971. formatted_title = format_title_for_platform(
  2972. "slack", title_data, show_source=True
  2973. )
  2974. else:
  2975. formatted_title = f"{title_data['title']}"
  2976. news_line = f" {j + 1}. {formatted_title}\n"
  2977. if j < len(stat["titles"]) - 1:
  2978. news_line += "\n"
  2979. test_content = current_batch + news_line
  2980. if (
  2981. len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8"))
  2982. >= max_bytes
  2983. ):
  2984. if current_batch_has_content:
  2985. batches.append(current_batch + base_footer)
  2986. current_batch = base_header + stats_header + word_header + news_line
  2987. current_batch_has_content = True
  2988. else:
  2989. current_batch = test_content
  2990. current_batch_has_content = True
  2991. # 词组间分隔符
  2992. if i < len(report_data["stats"]) - 1:
  2993. separator = ""
  2994. if format_type in ("wework", "bark"):
  2995. separator = f"\n\n\n\n"
  2996. elif format_type == "telegram":
  2997. separator = f"\n\n"
  2998. elif format_type == "ntfy":
  2999. separator = f"\n\n"
  3000. elif format_type == "feishu":
  3001. separator = f"\n{CONFIG['FEISHU_MESSAGE_SEPARATOR']}\n\n"
  3002. elif format_type == "dingtalk":
  3003. separator = f"\n---\n\n"
  3004. elif format_type == "slack":
  3005. separator = f"\n\n"
  3006. test_content = current_batch + separator
  3007. if (
  3008. len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8"))
  3009. < max_bytes
  3010. ):
  3011. current_batch = test_content
  3012. return current_batch, current_batch_has_content, batches
  3013. # 定义处理新增新闻的函数
  3014. def process_new_titles_section(current_batch, current_batch_has_content, batches):
  3015. """处理新增新闻"""
  3016. if not report_data["new_titles"]:
  3017. return current_batch, current_batch_has_content, batches
  3018. new_header = ""
  3019. if format_type in ("wework", "bark"):
  3020. new_header = f"\n\n\n\n🆕 **本次新增热点新闻** (共 {report_data['total_new_count']} 条)\n\n"
  3021. elif format_type == "telegram":
  3022. new_header = (
  3023. f"\n\n🆕 本次新增热点新闻 (共 {report_data['total_new_count']} 条)\n\n"
  3024. )
  3025. elif format_type == "ntfy":
  3026. new_header = f"\n\n🆕 **本次新增热点新闻** (共 {report_data['total_new_count']} 条)\n\n"
  3027. elif format_type == "feishu":
  3028. new_header = f"\n{CONFIG['FEISHU_MESSAGE_SEPARATOR']}\n\n🆕 **本次新增热点新闻** (共 {report_data['total_new_count']} 条)\n\n"
  3029. elif format_type == "dingtalk":
  3030. new_header = f"\n---\n\n🆕 **本次新增热点新闻** (共 {report_data['total_new_count']} 条)\n\n"
  3031. elif format_type == "slack":
  3032. new_header = f"\n\n🆕 *本次新增热点新闻* (共 {report_data['total_new_count']} 条)\n\n"
  3033. test_content = current_batch + new_header
  3034. if (
  3035. len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8"))
  3036. >= max_bytes
  3037. ):
  3038. if current_batch_has_content:
  3039. batches.append(current_batch + base_footer)
  3040. current_batch = base_header + new_header
  3041. current_batch_has_content = True
  3042. else:
  3043. current_batch = test_content
  3044. current_batch_has_content = True
  3045. # 逐个处理新增新闻来源
  3046. for source_data in report_data["new_titles"]:
  3047. source_header = ""
  3048. if format_type in ("wework", "bark"):
  3049. source_header = f"**{source_data['source_name']}** ({len(source_data['titles'])} 条):\n\n"
  3050. elif format_type == "telegram":
  3051. source_header = f"{source_data['source_name']} ({len(source_data['titles'])} 条):\n\n"
  3052. elif format_type == "ntfy":
  3053. source_header = f"**{source_data['source_name']}** ({len(source_data['titles'])} 条):\n\n"
  3054. elif format_type == "feishu":
  3055. source_header = f"**{source_data['source_name']}** ({len(source_data['titles'])} 条):\n\n"
  3056. elif format_type == "dingtalk":
  3057. source_header = f"**{source_data['source_name']}** ({len(source_data['titles'])} 条):\n\n"
  3058. elif format_type == "slack":
  3059. source_header = f"*{source_data['source_name']}* ({len(source_data['titles'])} 条):\n\n"
  3060. # 构建第一条新增新闻
  3061. first_news_line = ""
  3062. if source_data["titles"]:
  3063. first_title_data = source_data["titles"][0]
  3064. title_data_copy = first_title_data.copy()
  3065. title_data_copy["is_new"] = False
  3066. if format_type in ("wework", "bark"):
  3067. formatted_title = format_title_for_platform(
  3068. "wework", title_data_copy, show_source=False
  3069. )
  3070. elif format_type == "telegram":
  3071. formatted_title = format_title_for_platform(
  3072. "telegram", title_data_copy, show_source=False
  3073. )
  3074. elif format_type == "feishu":
  3075. formatted_title = format_title_for_platform(
  3076. "feishu", title_data_copy, show_source=False
  3077. )
  3078. elif format_type == "dingtalk":
  3079. formatted_title = format_title_for_platform(
  3080. "dingtalk", title_data_copy, show_source=False
  3081. )
  3082. elif format_type == "slack":
  3083. formatted_title = format_title_for_platform(
  3084. "slack", title_data_copy, show_source=False
  3085. )
  3086. else:
  3087. formatted_title = f"{title_data_copy['title']}"
  3088. first_news_line = f" 1. {formatted_title}\n"
  3089. # 原子性检查:来源标题+第一条新闻
  3090. source_with_first_news = source_header + first_news_line
  3091. test_content = current_batch + source_with_first_news
  3092. if (
  3093. len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8"))
  3094. >= max_bytes
  3095. ):
  3096. if current_batch_has_content:
  3097. batches.append(current_batch + base_footer)
  3098. current_batch = base_header + new_header + source_with_first_news
  3099. current_batch_has_content = True
  3100. start_index = 1
  3101. else:
  3102. current_batch = test_content
  3103. current_batch_has_content = True
  3104. start_index = 1
  3105. # 处理剩余新增新闻
  3106. for j in range(start_index, len(source_data["titles"])):
  3107. title_data = source_data["titles"][j]
  3108. title_data_copy = title_data.copy()
  3109. title_data_copy["is_new"] = False
  3110. if format_type == "wework":
  3111. formatted_title = format_title_for_platform(
  3112. "wework", title_data_copy, show_source=False
  3113. )
  3114. elif format_type == "telegram":
  3115. formatted_title = format_title_for_platform(
  3116. "telegram", title_data_copy, show_source=False
  3117. )
  3118. elif format_type == "feishu":
  3119. formatted_title = format_title_for_platform(
  3120. "feishu", title_data_copy, show_source=False
  3121. )
  3122. elif format_type == "dingtalk":
  3123. formatted_title = format_title_for_platform(
  3124. "dingtalk", title_data_copy, show_source=False
  3125. )
  3126. elif format_type == "slack":
  3127. formatted_title = format_title_for_platform(
  3128. "slack", title_data_copy, show_source=False
  3129. )
  3130. else:
  3131. formatted_title = f"{title_data_copy['title']}"
  3132. news_line = f" {j + 1}. {formatted_title}\n"
  3133. test_content = current_batch + news_line
  3134. if (
  3135. len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8"))
  3136. >= max_bytes
  3137. ):
  3138. if current_batch_has_content:
  3139. batches.append(current_batch + base_footer)
  3140. current_batch = base_header + new_header + source_header + news_line
  3141. current_batch_has_content = True
  3142. else:
  3143. current_batch = test_content
  3144. current_batch_has_content = True
  3145. current_batch += "\n"
  3146. return current_batch, current_batch_has_content, batches
  3147. # 根据配置决定处理顺序
  3148. if CONFIG.get("REVERSE_CONTENT_ORDER", False):
  3149. # 新增热点在前,热点词汇统计在后
  3150. current_batch, current_batch_has_content, batches = process_new_titles_section(
  3151. current_batch, current_batch_has_content, batches
  3152. )
  3153. current_batch, current_batch_has_content, batches = process_stats_section(
  3154. current_batch, current_batch_has_content, batches
  3155. )
  3156. else:
  3157. # 默认:热点词汇统计在前,新增热点在后
  3158. current_batch, current_batch_has_content, batches = process_stats_section(
  3159. current_batch, current_batch_has_content, batches
  3160. )
  3161. current_batch, current_batch_has_content, batches = process_new_titles_section(
  3162. current_batch, current_batch_has_content, batches
  3163. )
  3164. if report_data["failed_ids"]:
  3165. failed_header = ""
  3166. if format_type == "wework":
  3167. failed_header = f"\n\n\n\n⚠️ **数据获取失败的平台:**\n\n"
  3168. elif format_type == "telegram":
  3169. failed_header = f"\n\n⚠️ 数据获取失败的平台:\n\n"
  3170. elif format_type == "ntfy":
  3171. failed_header = f"\n\n⚠️ **数据获取失败的平台:**\n\n"
  3172. elif format_type == "feishu":
  3173. failed_header = f"\n{CONFIG['FEISHU_MESSAGE_SEPARATOR']}\n\n⚠️ **数据获取失败的平台:**\n\n"
  3174. elif format_type == "dingtalk":
  3175. failed_header = f"\n---\n\n⚠️ **数据获取失败的平台:**\n\n"
  3176. test_content = current_batch + failed_header
  3177. if (
  3178. len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8"))
  3179. >= max_bytes
  3180. ):
  3181. if current_batch_has_content:
  3182. batches.append(current_batch + base_footer)
  3183. current_batch = base_header + failed_header
  3184. current_batch_has_content = True
  3185. else:
  3186. current_batch = test_content
  3187. current_batch_has_content = True
  3188. for i, id_value in enumerate(report_data["failed_ids"], 1):
  3189. if format_type == "feishu":
  3190. failed_line = f" • <font color='red'>{id_value}</font>\n"
  3191. elif format_type == "dingtalk":
  3192. failed_line = f" • **{id_value}**\n"
  3193. else:
  3194. failed_line = f" • {id_value}\n"
  3195. test_content = current_batch + failed_line
  3196. if (
  3197. len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8"))
  3198. >= max_bytes
  3199. ):
  3200. if current_batch_has_content:
  3201. batches.append(current_batch + base_footer)
  3202. current_batch = base_header + failed_header + failed_line
  3203. current_batch_has_content = True
  3204. else:
  3205. current_batch = test_content
  3206. current_batch_has_content = True
  3207. # 完成最后批次
  3208. if current_batch_has_content:
  3209. batches.append(current_batch + base_footer)
  3210. return batches
  3211. def send_to_notifications(
  3212. stats: List[Dict],
  3213. failed_ids: Optional[List] = None,
  3214. report_type: str = "当日汇总",
  3215. new_titles: Optional[Dict] = None,
  3216. id_to_name: Optional[Dict] = None,
  3217. update_info: Optional[Dict] = None,
  3218. proxy_url: Optional[str] = None,
  3219. mode: str = "daily",
  3220. html_file_path: Optional[str] = None,
  3221. ) -> Dict[str, bool]:
  3222. """发送数据到多个通知平台(支持多账号)"""
  3223. results = {}
  3224. max_accounts = CONFIG["MAX_ACCOUNTS_PER_CHANNEL"]
  3225. if CONFIG["PUSH_WINDOW"]["ENABLED"]:
  3226. push_manager = PushRecordManager()
  3227. time_range_start = CONFIG["PUSH_WINDOW"]["TIME_RANGE"]["START"]
  3228. time_range_end = CONFIG["PUSH_WINDOW"]["TIME_RANGE"]["END"]
  3229. if not push_manager.is_in_time_range(time_range_start, time_range_end):
  3230. now = get_beijing_time()
  3231. print(
  3232. f"推送窗口控制:当前时间 {now.strftime('%H:%M')} 不在推送时间窗口 {time_range_start}-{time_range_end} 内,跳过推送"
  3233. )
  3234. return results
  3235. if CONFIG["PUSH_WINDOW"]["ONCE_PER_DAY"]:
  3236. if push_manager.has_pushed_today():
  3237. print(f"推送窗口控制:今天已推送过,跳过本次推送")
  3238. return results
  3239. else:
  3240. print(f"推送窗口控制:今天首次推送")
  3241. report_data = prepare_report_data(stats, failed_ids, new_titles, id_to_name, mode)
  3242. update_info_to_send = update_info if CONFIG["SHOW_VERSION_UPDATE"] else None
  3243. # 发送到飞书(多账号)
  3244. feishu_urls = parse_multi_account_config(CONFIG["FEISHU_WEBHOOK_URL"])
  3245. if feishu_urls:
  3246. feishu_urls = limit_accounts(feishu_urls, max_accounts, "飞书")
  3247. feishu_results = []
  3248. for i, url in enumerate(feishu_urls):
  3249. if url: # 跳过空值
  3250. account_label = f"账号{i+1}" if len(feishu_urls) > 1 else ""
  3251. result = send_to_feishu(
  3252. url, report_data, report_type, update_info_to_send, proxy_url, mode, account_label
  3253. )
  3254. feishu_results.append(result)
  3255. results["feishu"] = any(feishu_results) if feishu_results else False
  3256. # 发送到钉钉(多账号)
  3257. dingtalk_urls = parse_multi_account_config(CONFIG["DINGTALK_WEBHOOK_URL"])
  3258. if dingtalk_urls:
  3259. dingtalk_urls = limit_accounts(dingtalk_urls, max_accounts, "钉钉")
  3260. dingtalk_results = []
  3261. for i, url in enumerate(dingtalk_urls):
  3262. if url:
  3263. account_label = f"账号{i+1}" if len(dingtalk_urls) > 1 else ""
  3264. result = send_to_dingtalk(
  3265. url, report_data, report_type, update_info_to_send, proxy_url, mode, account_label
  3266. )
  3267. dingtalk_results.append(result)
  3268. results["dingtalk"] = any(dingtalk_results) if dingtalk_results else False
  3269. # 发送到企业微信(多账号)
  3270. wework_urls = parse_multi_account_config(CONFIG["WEWORK_WEBHOOK_URL"])
  3271. if wework_urls:
  3272. wework_urls = limit_accounts(wework_urls, max_accounts, "企业微信")
  3273. wework_results = []
  3274. for i, url in enumerate(wework_urls):
  3275. if url:
  3276. account_label = f"账号{i+1}" if len(wework_urls) > 1 else ""
  3277. result = send_to_wework(
  3278. url, report_data, report_type, update_info_to_send, proxy_url, mode, account_label
  3279. )
  3280. wework_results.append(result)
  3281. results["wework"] = any(wework_results) if wework_results else False
  3282. # 发送到 Telegram(多账号,需验证配对)
  3283. telegram_tokens = parse_multi_account_config(CONFIG["TELEGRAM_BOT_TOKEN"])
  3284. telegram_chat_ids = parse_multi_account_config(CONFIG["TELEGRAM_CHAT_ID"])
  3285. if telegram_tokens and telegram_chat_ids:
  3286. valid, count = validate_paired_configs(
  3287. {"bot_token": telegram_tokens, "chat_id": telegram_chat_ids},
  3288. "Telegram",
  3289. required_keys=["bot_token", "chat_id"]
  3290. )
  3291. if valid and count > 0:
  3292. telegram_tokens = limit_accounts(telegram_tokens, max_accounts, "Telegram")
  3293. telegram_chat_ids = telegram_chat_ids[:len(telegram_tokens)] # 保持数量一致
  3294. telegram_results = []
  3295. for i in range(len(telegram_tokens)):
  3296. token = telegram_tokens[i]
  3297. chat_id = telegram_chat_ids[i]
  3298. if token and chat_id:
  3299. account_label = f"账号{i+1}" if len(telegram_tokens) > 1 else ""
  3300. result = send_to_telegram(
  3301. token, chat_id, report_data, report_type,
  3302. update_info_to_send, proxy_url, mode, account_label
  3303. )
  3304. telegram_results.append(result)
  3305. results["telegram"] = any(telegram_results) if telegram_results else False
  3306. # 发送到 ntfy(多账号,需验证配对)
  3307. ntfy_server_url = CONFIG["NTFY_SERVER_URL"]
  3308. ntfy_topics = parse_multi_account_config(CONFIG["NTFY_TOPIC"])
  3309. ntfy_tokens = parse_multi_account_config(CONFIG["NTFY_TOKEN"])
  3310. if ntfy_server_url and ntfy_topics:
  3311. # 验证 token 和 topic 数量一致(如果配置了 token)
  3312. if ntfy_tokens and len(ntfy_tokens) != len(ntfy_topics):
  3313. print(f"❌ ntfy 配置错误:topic 数量({len(ntfy_topics)})与 token 数量({len(ntfy_tokens)})不一致,跳过 ntfy 推送")
  3314. else:
  3315. ntfy_topics = limit_accounts(ntfy_topics, max_accounts, "ntfy")
  3316. if ntfy_tokens:
  3317. ntfy_tokens = ntfy_tokens[:len(ntfy_topics)]
  3318. ntfy_results = []
  3319. for i, topic in enumerate(ntfy_topics):
  3320. if topic:
  3321. token = get_account_at_index(ntfy_tokens, i, "") if ntfy_tokens else ""
  3322. account_label = f"账号{i+1}" if len(ntfy_topics) > 1 else ""
  3323. result = send_to_ntfy(
  3324. ntfy_server_url, topic, token, report_data, report_type,
  3325. update_info_to_send, proxy_url, mode, account_label
  3326. )
  3327. ntfy_results.append(result)
  3328. results["ntfy"] = any(ntfy_results) if ntfy_results else False
  3329. # 发送到 Bark(多账号)
  3330. bark_urls = parse_multi_account_config(CONFIG["BARK_URL"])
  3331. if bark_urls:
  3332. bark_urls = limit_accounts(bark_urls, max_accounts, "Bark")
  3333. bark_results = []
  3334. for i, url in enumerate(bark_urls):
  3335. if url:
  3336. account_label = f"账号{i+1}" if len(bark_urls) > 1 else ""
  3337. result = send_to_bark(
  3338. url, report_data, report_type, update_info_to_send, proxy_url, mode, account_label
  3339. )
  3340. bark_results.append(result)
  3341. results["bark"] = any(bark_results) if bark_results else False
  3342. # 发送到 Slack(多账号)
  3343. slack_urls = parse_multi_account_config(CONFIG["SLACK_WEBHOOK_URL"])
  3344. if slack_urls:
  3345. slack_urls = limit_accounts(slack_urls, max_accounts, "Slack")
  3346. slack_results = []
  3347. for i, url in enumerate(slack_urls):
  3348. if url:
  3349. account_label = f"账号{i+1}" if len(slack_urls) > 1 else ""
  3350. result = send_to_slack(
  3351. url, report_data, report_type, update_info_to_send, proxy_url, mode, account_label
  3352. )
  3353. slack_results.append(result)
  3354. results["slack"] = any(slack_results) if slack_results else False
  3355. # 发送邮件(保持原有逻辑,已支持多收件人)
  3356. email_from = CONFIG["EMAIL_FROM"]
  3357. email_password = CONFIG["EMAIL_PASSWORD"]
  3358. email_to = CONFIG["EMAIL_TO"]
  3359. email_smtp_server = CONFIG.get("EMAIL_SMTP_SERVER", "")
  3360. email_smtp_port = CONFIG.get("EMAIL_SMTP_PORT", "")
  3361. if email_from and email_password and email_to:
  3362. results["email"] = send_to_email(
  3363. email_from,
  3364. email_password,
  3365. email_to,
  3366. report_type,
  3367. html_file_path,
  3368. email_smtp_server,
  3369. email_smtp_port,
  3370. )
  3371. if not results:
  3372. print("未配置任何通知渠道,跳过通知发送")
  3373. # 如果成功发送了任何通知,且启用了每天只推一次,则记录推送
  3374. if (
  3375. CONFIG["PUSH_WINDOW"]["ENABLED"]
  3376. and CONFIG["PUSH_WINDOW"]["ONCE_PER_DAY"]
  3377. and any(results.values())
  3378. ):
  3379. push_manager = PushRecordManager()
  3380. push_manager.record_push(report_type)
  3381. return results
  3382. def send_to_feishu(
  3383. webhook_url: str,
  3384. report_data: Dict,
  3385. report_type: str,
  3386. update_info: Optional[Dict] = None,
  3387. proxy_url: Optional[str] = None,
  3388. mode: str = "daily",
  3389. account_label: str = "",
  3390. ) -> bool:
  3391. """发送到飞书(支持分批发送)"""
  3392. headers = {"Content-Type": "application/json"}
  3393. proxies = None
  3394. if proxy_url:
  3395. proxies = {"http": proxy_url, "https": proxy_url}
  3396. # 日志前缀
  3397. log_prefix = f"飞书{account_label}" if account_label else "飞书"
  3398. # 获取分批内容,使用飞书专用的批次大小
  3399. feishu_batch_size = CONFIG.get("FEISHU_BATCH_SIZE", 29000)
  3400. # 预留批次头部空间,避免添加头部后超限
  3401. header_reserve = _get_max_batch_header_size("feishu")
  3402. batches = split_content_into_batches(
  3403. report_data,
  3404. "feishu",
  3405. update_info,
  3406. max_bytes=feishu_batch_size - header_reserve,
  3407. mode=mode,
  3408. )
  3409. # 统一添加批次头部(已预留空间,不会超限)
  3410. batches = add_batch_headers(batches, "feishu", feishu_batch_size)
  3411. print(f"{log_prefix}消息分为 {len(batches)} 批次发送 [{report_type}]")
  3412. # 逐批发送
  3413. for i, batch_content in enumerate(batches, 1):
  3414. batch_size = len(batch_content.encode("utf-8"))
  3415. print(
  3416. f"发送{log_prefix}第 {i}/{len(batches)} 批次,大小:{batch_size} 字节 [{report_type}]"
  3417. )
  3418. total_titles = sum(
  3419. len(stat["titles"]) for stat in report_data["stats"] if stat["count"] > 0
  3420. )
  3421. now = get_beijing_time()
  3422. payload = {
  3423. "msg_type": "text",
  3424. "content": {
  3425. "total_titles": total_titles,
  3426. "timestamp": now.strftime("%Y-%m-%d %H:%M:%S"),
  3427. "report_type": report_type,
  3428. "text": batch_content,
  3429. },
  3430. }
  3431. try:
  3432. response = requests.post(
  3433. webhook_url, headers=headers, json=payload, proxies=proxies, timeout=30
  3434. )
  3435. if response.status_code == 200:
  3436. result = response.json()
  3437. # 检查飞书的响应状态
  3438. if result.get("StatusCode") == 0 or result.get("code") == 0:
  3439. print(f"{log_prefix}第 {i}/{len(batches)} 批次发送成功 [{report_type}]")
  3440. # 批次间间隔
  3441. if i < len(batches):
  3442. time.sleep(CONFIG["BATCH_SEND_INTERVAL"])
  3443. else:
  3444. error_msg = result.get("msg") or result.get("StatusMessage", "未知错误")
  3445. print(
  3446. f"{log_prefix}第 {i}/{len(batches)} 批次发送失败 [{report_type}],错误:{error_msg}"
  3447. )
  3448. return False
  3449. else:
  3450. print(
  3451. f"{log_prefix}第 {i}/{len(batches)} 批次发送失败 [{report_type}],状态码:{response.status_code}"
  3452. )
  3453. return False
  3454. except Exception as e:
  3455. print(f"{log_prefix}第 {i}/{len(batches)} 批次发送出错 [{report_type}]:{e}")
  3456. return False
  3457. print(f"{log_prefix}所有 {len(batches)} 批次发送完成 [{report_type}]")
  3458. return True
  3459. def send_to_dingtalk(
  3460. webhook_url: str,
  3461. report_data: Dict,
  3462. report_type: str,
  3463. update_info: Optional[Dict] = None,
  3464. proxy_url: Optional[str] = None,
  3465. mode: str = "daily",
  3466. account_label: str = "",
  3467. ) -> bool:
  3468. """发送到钉钉(支持分批发送)"""
  3469. headers = {"Content-Type": "application/json"}
  3470. proxies = None
  3471. if proxy_url:
  3472. proxies = {"http": proxy_url, "https": proxy_url}
  3473. # 日志前缀
  3474. log_prefix = f"钉钉{account_label}" if account_label else "钉钉"
  3475. # 获取分批内容,使用钉钉专用的批次大小
  3476. dingtalk_batch_size = CONFIG.get("DINGTALK_BATCH_SIZE", 20000)
  3477. # 预留批次头部空间,避免添加头部后超限
  3478. header_reserve = _get_max_batch_header_size("dingtalk")
  3479. batches = split_content_into_batches(
  3480. report_data,
  3481. "dingtalk",
  3482. update_info,
  3483. max_bytes=dingtalk_batch_size - header_reserve,
  3484. mode=mode,
  3485. )
  3486. # 统一添加批次头部(已预留空间,不会超限)
  3487. batches = add_batch_headers(batches, "dingtalk", dingtalk_batch_size)
  3488. print(f"{log_prefix}消息分为 {len(batches)} 批次发送 [{report_type}]")
  3489. # 逐批发送
  3490. for i, batch_content in enumerate(batches, 1):
  3491. batch_size = len(batch_content.encode("utf-8"))
  3492. print(
  3493. f"发送{log_prefix}第 {i}/{len(batches)} 批次,大小:{batch_size} 字节 [{report_type}]"
  3494. )
  3495. payload = {
  3496. "msgtype": "markdown",
  3497. "markdown": {
  3498. "title": f"TrendRadar 热点分析报告 - {report_type}",
  3499. "text": batch_content,
  3500. },
  3501. }
  3502. try:
  3503. response = requests.post(
  3504. webhook_url, headers=headers, json=payload, proxies=proxies, timeout=30
  3505. )
  3506. if response.status_code == 200:
  3507. result = response.json()
  3508. if result.get("errcode") == 0:
  3509. print(f"{log_prefix}第 {i}/{len(batches)} 批次发送成功 [{report_type}]")
  3510. # 批次间间隔
  3511. if i < len(batches):
  3512. time.sleep(CONFIG["BATCH_SEND_INTERVAL"])
  3513. else:
  3514. print(
  3515. f"{log_prefix}第 {i}/{len(batches)} 批次发送失败 [{report_type}],错误:{result.get('errmsg')}"
  3516. )
  3517. return False
  3518. else:
  3519. print(
  3520. f"{log_prefix}第 {i}/{len(batches)} 批次发送失败 [{report_type}],状态码:{response.status_code}"
  3521. )
  3522. return False
  3523. except Exception as e:
  3524. print(f"{log_prefix}第 {i}/{len(batches)} 批次发送出错 [{report_type}]:{e}")
  3525. return False
  3526. print(f"{log_prefix}所有 {len(batches)} 批次发送完成 [{report_type}]")
  3527. return True
  3528. def strip_markdown(text: str) -> str:
  3529. """去除文本中的 markdown 语法格式,用于个人微信推送"""
  3530. # 去除粗体 **text** 或 __text__
  3531. text = re.sub(r'\*\*(.+?)\*\*', r'\1', text)
  3532. text = re.sub(r'__(.+?)__', r'\1', text)
  3533. # 去除斜体 *text* 或 _text_
  3534. text = re.sub(r'\*(.+?)\*', r'\1', text)
  3535. text = re.sub(r'_(.+?)_', r'\1', text)
  3536. # 去除删除线 ~~text~~
  3537. text = re.sub(r'~~(.+?)~~', r'\1', text)
  3538. # 转换链接 [text](url) -> text url(保留 URL)
  3539. text = re.sub(r'\[([^\]]+)\]\(([^)]+)\)', r'\1 \2', text)
  3540. # 如果不需要保留 URL,可以使用下面这行(只保留标题文本):
  3541. # text = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', text)
  3542. # 去除图片 ![alt](url) -> alt
  3543. text = re.sub(r'!\[(.+?)\]\(.+?\)', r'\1', text)
  3544. # 去除行内代码 `code`
  3545. text = re.sub(r'`(.+?)`', r'\1', text)
  3546. # 去除引用符号 >
  3547. text = re.sub(r'^>\s*', '', text, flags=re.MULTILINE)
  3548. # 去除标题符号 # ## ### 等
  3549. text = re.sub(r'^#+\s*', '', text, flags=re.MULTILINE)
  3550. # 去除水平分割线 --- 或 ***
  3551. text = re.sub(r'^[\-\*]{3,}\s*$', '', text, flags=re.MULTILINE)
  3552. # 去除 HTML 标签 <font color='xxx'>text</font> -> text
  3553. text = re.sub(r'<font[^>]*>(.+?)</font>', r'\1', text)
  3554. text = re.sub(r'<[^>]+>', '', text)
  3555. # 清理多余的空行(保留最多两个连续空行)
  3556. text = re.sub(r'\n{3,}', '\n\n', text)
  3557. return text.strip()
  3558. def send_to_wework(
  3559. webhook_url: str,
  3560. report_data: Dict,
  3561. report_type: str,
  3562. update_info: Optional[Dict] = None,
  3563. proxy_url: Optional[str] = None,
  3564. mode: str = "daily",
  3565. account_label: str = "",
  3566. ) -> bool:
  3567. """发送到企业微信(支持分批发送,支持 markdown 和 text 两种格式)"""
  3568. headers = {"Content-Type": "application/json"}
  3569. proxies = None
  3570. if proxy_url:
  3571. proxies = {"http": proxy_url, "https": proxy_url}
  3572. # 日志前缀
  3573. log_prefix = f"企业微信{account_label}" if account_label else "企业微信"
  3574. # 获取消息类型配置(markdown 或 text)
  3575. msg_type = CONFIG.get("WEWORK_MSG_TYPE", "markdown").lower()
  3576. is_text_mode = msg_type == "text"
  3577. if is_text_mode:
  3578. print(f"{log_prefix}使用 text 格式(个人微信模式)[{report_type}]")
  3579. else:
  3580. print(f"{log_prefix}使用 markdown 格式(群机器人模式)[{report_type}]")
  3581. # text 模式使用 wework_text,markdown 模式使用 wework
  3582. header_format_type = "wework_text" if is_text_mode else "wework"
  3583. # 获取分批内容,预留批次头部空间
  3584. wework_batch_size = CONFIG.get("MESSAGE_BATCH_SIZE", 4000)
  3585. header_reserve = _get_max_batch_header_size(header_format_type)
  3586. batches = split_content_into_batches(
  3587. report_data, "wework", update_info, max_bytes=wework_batch_size - header_reserve, mode=mode
  3588. )
  3589. # 统一添加批次头部(已预留空间,不会超限)
  3590. batches = add_batch_headers(batches, header_format_type, wework_batch_size)
  3591. print(f"{log_prefix}消息分为 {len(batches)} 批次发送 [{report_type}]")
  3592. # 逐批发送
  3593. for i, batch_content in enumerate(batches, 1):
  3594. # 根据消息类型构建 payload
  3595. if is_text_mode:
  3596. # text 格式:去除 markdown 语法
  3597. plain_content = strip_markdown(batch_content)
  3598. payload = {"msgtype": "text", "text": {"content": plain_content}}
  3599. batch_size = len(plain_content.encode("utf-8"))
  3600. else:
  3601. # markdown 格式:保持原样
  3602. payload = {"msgtype": "markdown", "markdown": {"content": batch_content}}
  3603. batch_size = len(batch_content.encode("utf-8"))
  3604. print(
  3605. f"发送{log_prefix}第 {i}/{len(batches)} 批次,大小:{batch_size} 字节 [{report_type}]"
  3606. )
  3607. try:
  3608. response = requests.post(
  3609. webhook_url, headers=headers, json=payload, proxies=proxies, timeout=30
  3610. )
  3611. if response.status_code == 200:
  3612. result = response.json()
  3613. if result.get("errcode") == 0:
  3614. print(f"{log_prefix}第 {i}/{len(batches)} 批次发送成功 [{report_type}]")
  3615. # 批次间间隔
  3616. if i < len(batches):
  3617. time.sleep(CONFIG["BATCH_SEND_INTERVAL"])
  3618. else:
  3619. print(
  3620. f"{log_prefix}第 {i}/{len(batches)} 批次发送失败 [{report_type}],错误:{result.get('errmsg')}"
  3621. )
  3622. return False
  3623. else:
  3624. print(
  3625. f"{log_prefix}第 {i}/{len(batches)} 批次发送失败 [{report_type}],状态码:{response.status_code}"
  3626. )
  3627. return False
  3628. except Exception as e:
  3629. print(f"{log_prefix}第 {i}/{len(batches)} 批次发送出错 [{report_type}]:{e}")
  3630. return False
  3631. print(f"{log_prefix}所有 {len(batches)} 批次发送完成 [{report_type}]")
  3632. return True
  3633. def send_to_telegram(
  3634. bot_token: str,
  3635. chat_id: str,
  3636. report_data: Dict,
  3637. report_type: str,
  3638. update_info: Optional[Dict] = None,
  3639. proxy_url: Optional[str] = None,
  3640. mode: str = "daily",
  3641. account_label: str = "",
  3642. ) -> bool:
  3643. """发送到Telegram(支持分批发送)"""
  3644. headers = {"Content-Type": "application/json"}
  3645. url = f"https://api.telegram.org/bot{bot_token}/sendMessage"
  3646. proxies = None
  3647. if proxy_url:
  3648. proxies = {"http": proxy_url, "https": proxy_url}
  3649. # 日志前缀
  3650. log_prefix = f"Telegram{account_label}" if account_label else "Telegram"
  3651. # 获取分批内容,预留批次头部空间
  3652. telegram_batch_size = CONFIG.get("MESSAGE_BATCH_SIZE", 4000)
  3653. header_reserve = _get_max_batch_header_size("telegram")
  3654. batches = split_content_into_batches(
  3655. report_data, "telegram", update_info, max_bytes=telegram_batch_size - header_reserve, mode=mode
  3656. )
  3657. # 统一添加批次头部(已预留空间,不会超限)
  3658. batches = add_batch_headers(batches, "telegram", telegram_batch_size)
  3659. print(f"{log_prefix}消息分为 {len(batches)} 批次发送 [{report_type}]")
  3660. # 逐批发送
  3661. for i, batch_content in enumerate(batches, 1):
  3662. batch_size = len(batch_content.encode("utf-8"))
  3663. print(
  3664. f"发送{log_prefix}第 {i}/{len(batches)} 批次,大小:{batch_size} 字节 [{report_type}]"
  3665. )
  3666. payload = {
  3667. "chat_id": chat_id,
  3668. "text": batch_content,
  3669. "parse_mode": "HTML",
  3670. "disable_web_page_preview": True,
  3671. }
  3672. try:
  3673. response = requests.post(
  3674. url, headers=headers, json=payload, proxies=proxies, timeout=30
  3675. )
  3676. if response.status_code == 200:
  3677. result = response.json()
  3678. if result.get("ok"):
  3679. print(f"{log_prefix}第 {i}/{len(batches)} 批次发送成功 [{report_type}]")
  3680. # 批次间间隔
  3681. if i < len(batches):
  3682. time.sleep(CONFIG["BATCH_SEND_INTERVAL"])
  3683. else:
  3684. print(
  3685. f"{log_prefix}第 {i}/{len(batches)} 批次发送失败 [{report_type}],错误:{result.get('description')}"
  3686. )
  3687. return False
  3688. else:
  3689. print(
  3690. f"{log_prefix}第 {i}/{len(batches)} 批次发送失败 [{report_type}],状态码:{response.status_code}"
  3691. )
  3692. return False
  3693. except Exception as e:
  3694. print(f"{log_prefix}第 {i}/{len(batches)} 批次发送出错 [{report_type}]:{e}")
  3695. return False
  3696. print(f"{log_prefix}所有 {len(batches)} 批次发送完成 [{report_type}]")
  3697. return True
  3698. def send_to_email(
  3699. from_email: str,
  3700. password: str,
  3701. to_email: str,
  3702. report_type: str,
  3703. html_file_path: str,
  3704. custom_smtp_server: Optional[str] = None,
  3705. custom_smtp_port: Optional[int] = None,
  3706. ) -> bool:
  3707. """发送邮件通知"""
  3708. try:
  3709. if not html_file_path or not Path(html_file_path).exists():
  3710. print(f"错误:HTML文件不存在或未提供: {html_file_path}")
  3711. return False
  3712. print(f"使用HTML文件: {html_file_path}")
  3713. with open(html_file_path, "r", encoding="utf-8") as f:
  3714. html_content = f.read()
  3715. domain = from_email.split("@")[-1].lower()
  3716. if custom_smtp_server and custom_smtp_port:
  3717. # 使用自定义 SMTP 配置
  3718. smtp_server = custom_smtp_server
  3719. smtp_port = int(custom_smtp_port)
  3720. # 根据端口判断加密方式:465=SSL, 587=TLS
  3721. if smtp_port == 465:
  3722. use_tls = False # SSL 模式(SMTP_SSL)
  3723. elif smtp_port == 587:
  3724. use_tls = True # TLS 模式(STARTTLS)
  3725. else:
  3726. # 其他端口优先尝试 TLS(更安全,更广泛支持)
  3727. use_tls = True
  3728. elif domain in SMTP_CONFIGS:
  3729. # 使用预设配置
  3730. config = SMTP_CONFIGS[domain]
  3731. smtp_server = config["server"]
  3732. smtp_port = config["port"]
  3733. use_tls = config["encryption"] == "TLS"
  3734. else:
  3735. print(f"未识别的邮箱服务商: {domain},使用通用 SMTP 配置")
  3736. smtp_server = f"smtp.{domain}"
  3737. smtp_port = 587
  3738. use_tls = True
  3739. msg = MIMEMultipart("alternative")
  3740. # 严格按照 RFC 标准设置 From header
  3741. sender_name = "TrendRadar"
  3742. msg["From"] = formataddr((sender_name, from_email))
  3743. # 设置收件人
  3744. recipients = [addr.strip() for addr in to_email.split(",")]
  3745. if len(recipients) == 1:
  3746. msg["To"] = recipients[0]
  3747. else:
  3748. msg["To"] = ", ".join(recipients)
  3749. # 设置邮件主题
  3750. now = get_beijing_time()
  3751. subject = f"TrendRadar 热点分析报告 - {report_type} - {now.strftime('%m月%d日 %H:%M')}"
  3752. msg["Subject"] = Header(subject, "utf-8")
  3753. # 设置其他标准 header
  3754. msg["MIME-Version"] = "1.0"
  3755. msg["Date"] = formatdate(localtime=True)
  3756. msg["Message-ID"] = make_msgid()
  3757. # 添加纯文本部分(作为备选)
  3758. text_content = f"""
  3759. TrendRadar 热点分析报告
  3760. ========================
  3761. 报告类型:{report_type}
  3762. 生成时间:{now.strftime('%Y-%m-%d %H:%M:%S')}
  3763. 请使用支持HTML的邮件客户端查看完整报告内容。
  3764. """
  3765. text_part = MIMEText(text_content, "plain", "utf-8")
  3766. msg.attach(text_part)
  3767. html_part = MIMEText(html_content, "html", "utf-8")
  3768. msg.attach(html_part)
  3769. print(f"正在发送邮件到 {to_email}...")
  3770. print(f"SMTP 服务器: {smtp_server}:{smtp_port}")
  3771. print(f"发件人: {from_email}")
  3772. try:
  3773. if use_tls:
  3774. # TLS 模式
  3775. server = smtplib.SMTP(smtp_server, smtp_port, timeout=30)
  3776. server.set_debuglevel(0) # 设为1可以查看详细调试信息
  3777. server.ehlo()
  3778. server.starttls()
  3779. server.ehlo()
  3780. else:
  3781. # SSL 模式
  3782. server = smtplib.SMTP_SSL(smtp_server, smtp_port, timeout=30)
  3783. server.set_debuglevel(0)
  3784. server.ehlo()
  3785. # 登录
  3786. server.login(from_email, password)
  3787. # 发送邮件
  3788. server.send_message(msg)
  3789. server.quit()
  3790. print(f"邮件发送成功 [{report_type}] -> {to_email}")
  3791. return True
  3792. except smtplib.SMTPServerDisconnected:
  3793. print(f"邮件发送失败:服务器意外断开连接,请检查网络或稍后重试")
  3794. return False
  3795. except smtplib.SMTPAuthenticationError as e:
  3796. print(f"邮件发送失败:认证错误,请检查邮箱和密码/授权码")
  3797. print(f"详细错误: {str(e)}")
  3798. return False
  3799. except smtplib.SMTPRecipientsRefused as e:
  3800. print(f"邮件发送失败:收件人地址被拒绝 {e}")
  3801. return False
  3802. except smtplib.SMTPSenderRefused as e:
  3803. print(f"邮件发送失败:发件人地址被拒绝 {e}")
  3804. return False
  3805. except smtplib.SMTPDataError as e:
  3806. print(f"邮件发送失败:邮件数据错误 {e}")
  3807. return False
  3808. except smtplib.SMTPConnectError as e:
  3809. print(f"邮件发送失败:无法连接到 SMTP 服务器 {smtp_server}:{smtp_port}")
  3810. print(f"详细错误: {str(e)}")
  3811. return False
  3812. except Exception as e:
  3813. print(f"邮件发送失败 [{report_type}]:{e}")
  3814. import traceback
  3815. traceback.print_exc()
  3816. return False
  3817. def send_to_ntfy(
  3818. server_url: str,
  3819. topic: str,
  3820. token: Optional[str],
  3821. report_data: Dict,
  3822. report_type: str,
  3823. update_info: Optional[Dict] = None,
  3824. proxy_url: Optional[str] = None,
  3825. mode: str = "daily",
  3826. account_label: str = "",
  3827. ) -> bool:
  3828. """发送到ntfy(支持分批发送,严格遵守4KB限制)"""
  3829. # 日志前缀
  3830. log_prefix = f"ntfy{account_label}" if account_label else "ntfy"
  3831. # 避免 HTTP header 编码问题
  3832. report_type_en_map = {
  3833. "当日汇总": "Daily Summary",
  3834. "当前榜单汇总": "Current Ranking",
  3835. "增量更新": "Incremental Update",
  3836. "实时增量": "Realtime Incremental",
  3837. "实时当前榜单": "Realtime Current Ranking",
  3838. }
  3839. report_type_en = report_type_en_map.get(report_type, "News Report")
  3840. headers = {
  3841. "Content-Type": "text/plain; charset=utf-8",
  3842. "Markdown": "yes",
  3843. "Title": report_type_en,
  3844. "Priority": "default",
  3845. "Tags": "news",
  3846. }
  3847. if token:
  3848. headers["Authorization"] = f"Bearer {token}"
  3849. # 构建完整URL,确保格式正确
  3850. base_url = server_url.rstrip("/")
  3851. if not base_url.startswith(("http://", "https://")):
  3852. base_url = f"https://{base_url}"
  3853. url = f"{base_url}/{topic}"
  3854. proxies = None
  3855. if proxy_url:
  3856. proxies = {"http": proxy_url, "https": proxy_url}
  3857. # 获取分批内容,使用ntfy专用的4KB限制,预留批次头部空间
  3858. ntfy_batch_size = 3800
  3859. header_reserve = _get_max_batch_header_size("ntfy")
  3860. batches = split_content_into_batches(
  3861. report_data, "ntfy", update_info, max_bytes=ntfy_batch_size - header_reserve, mode=mode
  3862. )
  3863. # 统一添加批次头部(已预留空间,不会超限)
  3864. batches = add_batch_headers(batches, "ntfy", ntfy_batch_size)
  3865. total_batches = len(batches)
  3866. print(f"{log_prefix}消息分为 {total_batches} 批次发送 [{report_type}]")
  3867. # 反转批次顺序,使得在ntfy客户端显示时顺序正确
  3868. # ntfy显示最新消息在上面,所以我们从最后一批开始推送
  3869. reversed_batches = list(reversed(batches))
  3870. print(f"{log_prefix}将按反向顺序推送(最后批次先推送),确保客户端显示顺序正确")
  3871. # 逐批发送(反向顺序)
  3872. success_count = 0
  3873. for idx, batch_content in enumerate(reversed_batches, 1):
  3874. # 计算正确的批次编号(用户视角的编号)
  3875. actual_batch_num = total_batches - idx + 1
  3876. batch_size = len(batch_content.encode("utf-8"))
  3877. print(
  3878. f"发送{log_prefix}第 {actual_batch_num}/{total_batches} 批次(推送顺序: {idx}/{total_batches}),大小:{batch_size} 字节 [{report_type}]"
  3879. )
  3880. # 检查消息大小,确保不超过4KB
  3881. if batch_size > 4096:
  3882. print(f"警告:{log_prefix}第 {actual_batch_num} 批次消息过大({batch_size} 字节),可能被拒绝")
  3883. # 更新 headers 的批次标识
  3884. current_headers = headers.copy()
  3885. if total_batches > 1:
  3886. current_headers["Title"] = (
  3887. f"{report_type_en} ({actual_batch_num}/{total_batches})"
  3888. )
  3889. try:
  3890. response = requests.post(
  3891. url,
  3892. headers=current_headers,
  3893. data=batch_content.encode("utf-8"),
  3894. proxies=proxies,
  3895. timeout=30,
  3896. )
  3897. if response.status_code == 200:
  3898. print(f"{log_prefix}第 {actual_batch_num}/{total_batches} 批次发送成功 [{report_type}]")
  3899. success_count += 1
  3900. if idx < total_batches:
  3901. # 公共服务器建议 2-3 秒,自托管可以更短
  3902. interval = 2 if "ntfy.sh" in server_url else 1
  3903. time.sleep(interval)
  3904. elif response.status_code == 429:
  3905. print(
  3906. f"{log_prefix}第 {actual_batch_num}/{total_batches} 批次速率限制 [{report_type}],等待后重试"
  3907. )
  3908. time.sleep(10) # 等待10秒后重试
  3909. # 重试一次
  3910. retry_response = requests.post(
  3911. url,
  3912. headers=current_headers,
  3913. data=batch_content.encode("utf-8"),
  3914. proxies=proxies,
  3915. timeout=30,
  3916. )
  3917. if retry_response.status_code == 200:
  3918. print(f"{log_prefix}第 {actual_batch_num}/{total_batches} 批次重试成功 [{report_type}]")
  3919. success_count += 1
  3920. else:
  3921. print(
  3922. f"{log_prefix}第 {actual_batch_num}/{total_batches} 批次重试失败,状态码:{retry_response.status_code}"
  3923. )
  3924. elif response.status_code == 413:
  3925. print(
  3926. f"{log_prefix}第 {actual_batch_num}/{total_batches} 批次消息过大被拒绝 [{report_type}],消息大小:{batch_size} 字节"
  3927. )
  3928. else:
  3929. print(
  3930. f"{log_prefix}第 {actual_batch_num}/{total_batches} 批次发送失败 [{report_type}],状态码:{response.status_code}"
  3931. )
  3932. try:
  3933. print(f"错误详情:{response.text}")
  3934. except:
  3935. pass
  3936. except requests.exceptions.ConnectTimeout:
  3937. print(f"{log_prefix}第 {actual_batch_num}/{total_batches} 批次连接超时 [{report_type}]")
  3938. except requests.exceptions.ReadTimeout:
  3939. print(f"{log_prefix}第 {actual_batch_num}/{total_batches} 批次读取超时 [{report_type}]")
  3940. except requests.exceptions.ConnectionError as e:
  3941. print(f"{log_prefix}第 {actual_batch_num}/{total_batches} 批次连接错误 [{report_type}]:{e}")
  3942. except Exception as e:
  3943. print(f"{log_prefix}第 {actual_batch_num}/{total_batches} 批次发送异常 [{report_type}]:{e}")
  3944. # 判断整体发送是否成功
  3945. if success_count == total_batches:
  3946. print(f"{log_prefix}所有 {total_batches} 批次发送完成 [{report_type}]")
  3947. return True
  3948. elif success_count > 0:
  3949. print(f"{log_prefix}部分发送成功:{success_count}/{total_batches} 批次 [{report_type}]")
  3950. return True # 部分成功也视为成功
  3951. else:
  3952. print(f"{log_prefix}发送完全失败 [{report_type}]")
  3953. return False
  3954. def send_to_bark(
  3955. bark_url: str,
  3956. report_data: Dict,
  3957. report_type: str,
  3958. update_info: Optional[Dict] = None,
  3959. proxy_url: Optional[str] = None,
  3960. mode: str = "daily",
  3961. account_label: str = "",
  3962. ) -> bool:
  3963. """发送到Bark(支持分批发送,使用 markdown 格式)"""
  3964. # 日志前缀
  3965. log_prefix = f"Bark{account_label}" if account_label else "Bark"
  3966. proxies = None
  3967. if proxy_url:
  3968. proxies = {"http": proxy_url, "https": proxy_url}
  3969. # 解析 Bark URL,提取 device_key 和 API 端点
  3970. # Bark URL 格式: https://api.day.app/device_key 或 https://bark.day.app/device_key
  3971. from urllib.parse import urlparse
  3972. parsed_url = urlparse(bark_url)
  3973. device_key = parsed_url.path.strip('/').split('/')[0] if parsed_url.path else None
  3974. if not device_key:
  3975. print(f"{log_prefix} URL 格式错误,无法提取 device_key: {bark_url}")
  3976. return False
  3977. # 构建正确的 API 端点
  3978. api_endpoint = f"{parsed_url.scheme}://{parsed_url.netloc}/push"
  3979. # 获取分批内容(Bark 限制为 3600 字节以避免 413 错误),预留批次头部空间
  3980. bark_batch_size = CONFIG["BARK_BATCH_SIZE"]
  3981. header_reserve = _get_max_batch_header_size("bark")
  3982. batches = split_content_into_batches(
  3983. report_data, "bark", update_info, max_bytes=bark_batch_size - header_reserve, mode=mode
  3984. )
  3985. # 统一添加批次头部(已预留空间,不会超限)
  3986. batches = add_batch_headers(batches, "bark", bark_batch_size)
  3987. total_batches = len(batches)
  3988. print(f"{log_prefix}消息分为 {total_batches} 批次发送 [{report_type}]")
  3989. # 反转批次顺序,使得在Bark客户端显示时顺序正确
  3990. # Bark显示最新消息在上面,所以我们从最后一批开始推送
  3991. reversed_batches = list(reversed(batches))
  3992. print(f"{log_prefix}将按反向顺序推送(最后批次先推送),确保客户端显示顺序正确")
  3993. # 逐批发送(反向顺序)
  3994. success_count = 0
  3995. for idx, batch_content in enumerate(reversed_batches, 1):
  3996. # 计算正确的批次编号(用户视角的编号)
  3997. actual_batch_num = total_batches - idx + 1
  3998. batch_size = len(batch_content.encode("utf-8"))
  3999. print(
  4000. f"发送{log_prefix}第 {actual_batch_num}/{total_batches} 批次(推送顺序: {idx}/{total_batches}),大小:{batch_size} 字节 [{report_type}]"
  4001. )
  4002. # 检查消息大小(Bark使用APNs,限制4KB)
  4003. if batch_size > 4096:
  4004. print(
  4005. f"警告:{log_prefix}第 {actual_batch_num}/{total_batches} 批次消息过大({batch_size} 字节),可能被拒绝"
  4006. )
  4007. # 构建JSON payload
  4008. payload = {
  4009. "title": report_type,
  4010. "markdown": batch_content,
  4011. "device_key": device_key,
  4012. "sound": "default",
  4013. "group": "TrendRadar",
  4014. "action": "none", # 点击推送跳到 APP 不弹出弹框,方便阅读
  4015. }
  4016. try:
  4017. response = requests.post(
  4018. api_endpoint,
  4019. json=payload,
  4020. proxies=proxies,
  4021. timeout=30,
  4022. )
  4023. if response.status_code == 200:
  4024. result = response.json()
  4025. if result.get("code") == 200:
  4026. print(f"{log_prefix}第 {actual_batch_num}/{total_batches} 批次发送成功 [{report_type}]")
  4027. success_count += 1
  4028. # 批次间间隔
  4029. if idx < total_batches:
  4030. time.sleep(CONFIG["BATCH_SEND_INTERVAL"])
  4031. else:
  4032. print(
  4033. f"{log_prefix}第 {actual_batch_num}/{total_batches} 批次发送失败 [{report_type}],错误:{result.get('message', '未知错误')}"
  4034. )
  4035. else:
  4036. print(
  4037. f"{log_prefix}第 {actual_batch_num}/{total_batches} 批次发送失败 [{report_type}],状态码:{response.status_code}"
  4038. )
  4039. try:
  4040. print(f"错误详情:{response.text}")
  4041. except:
  4042. pass
  4043. except requests.exceptions.ConnectTimeout:
  4044. print(f"{log_prefix}第 {actual_batch_num}/{total_batches} 批次连接超时 [{report_type}]")
  4045. except requests.exceptions.ReadTimeout:
  4046. print(f"{log_prefix}第 {actual_batch_num}/{total_batches} 批次读取超时 [{report_type}]")
  4047. except requests.exceptions.ConnectionError as e:
  4048. print(f"{log_prefix}第 {actual_batch_num}/{total_batches} 批次连接错误 [{report_type}]:{e}")
  4049. except Exception as e:
  4050. print(f"{log_prefix}第 {actual_batch_num}/{total_batches} 批次发送异常 [{report_type}]:{e}")
  4051. # 判断整体发送是否成功
  4052. if success_count == total_batches:
  4053. print(f"{log_prefix}所有 {total_batches} 批次发送完成 [{report_type}]")
  4054. return True
  4055. elif success_count > 0:
  4056. print(f"{log_prefix}部分发送成功:{success_count}/{total_batches} 批次 [{report_type}]")
  4057. return True # 部分成功也视为成功
  4058. else:
  4059. print(f"{log_prefix}发送完全失败 [{report_type}]")
  4060. return False
  4061. def convert_markdown_to_mrkdwn(content: str) -> str:
  4062. """
  4063. 将标准 Markdown 转换为 Slack 的 mrkdwn 格式
  4064. 转换规则:
  4065. - **粗体** → *粗体*
  4066. - [文本](url) → <url|文本>
  4067. - 保留其他格式(代码块、列表等)
  4068. """
  4069. # 1. 转换链接格式: [文本](url) → <url|文本>
  4070. content = re.sub(r'\[([^\]]+)\]\(([^)]+)\)', r'<\2|\1>', content)
  4071. # 2. 转换粗体: **文本** → *文本*
  4072. content = re.sub(r'\*\*([^*]+)\*\*', r'*\1*', content)
  4073. return content
  4074. def send_to_slack(
  4075. webhook_url: str,
  4076. report_data: Dict,
  4077. report_type: str,
  4078. update_info: Optional[Dict] = None,
  4079. proxy_url: Optional[str] = None,
  4080. mode: str = "daily",
  4081. account_label: str = "",
  4082. ) -> bool:
  4083. """发送到Slack(支持分批发送,使用 mrkdwn 格式)"""
  4084. headers = {"Content-Type": "application/json"}
  4085. proxies = None
  4086. if proxy_url:
  4087. proxies = {"http": proxy_url, "https": proxy_url}
  4088. # 日志前缀
  4089. log_prefix = f"Slack{account_label}" if account_label else "Slack"
  4090. # 获取分批内容(使用 Slack 批次大小),预留批次头部空间
  4091. slack_batch_size = CONFIG["SLACK_BATCH_SIZE"]
  4092. header_reserve = _get_max_batch_header_size("slack")
  4093. batches = split_content_into_batches(
  4094. report_data, "slack", update_info, max_bytes=slack_batch_size - header_reserve, mode=mode
  4095. )
  4096. # 统一添加批次头部(已预留空间,不会超限)
  4097. batches = add_batch_headers(batches, "slack", slack_batch_size)
  4098. print(f"{log_prefix}消息分为 {len(batches)} 批次发送 [{report_type}]")
  4099. # 逐批发送
  4100. for i, batch_content in enumerate(batches, 1):
  4101. # 转换 Markdown 到 mrkdwn 格式
  4102. mrkdwn_content = convert_markdown_to_mrkdwn(batch_content)
  4103. batch_size = len(mrkdwn_content.encode("utf-8"))
  4104. print(
  4105. f"发送{log_prefix}第 {i}/{len(batches)} 批次,大小:{batch_size} 字节 [{report_type}]"
  4106. )
  4107. # 构建 Slack payload(使用简单的 text 字段,支持 mrkdwn)
  4108. payload = {
  4109. "text": mrkdwn_content
  4110. }
  4111. try:
  4112. response = requests.post(
  4113. webhook_url, headers=headers, json=payload, proxies=proxies, timeout=30
  4114. )
  4115. # Slack Incoming Webhooks 成功时返回 "ok" 文本
  4116. if response.status_code == 200 and response.text == "ok":
  4117. print(f"{log_prefix}第 {i}/{len(batches)} 批次发送成功 [{report_type}]")
  4118. # 批次间间隔
  4119. if i < len(batches):
  4120. time.sleep(CONFIG["BATCH_SEND_INTERVAL"])
  4121. else:
  4122. error_msg = response.text if response.text else f"状态码:{response.status_code}"
  4123. print(
  4124. f"{log_prefix}第 {i}/{len(batches)} 批次发送失败 [{report_type}],错误:{error_msg}"
  4125. )
  4126. return False
  4127. except Exception as e:
  4128. print(f"{log_prefix}第 {i}/{len(batches)} 批次发送出错 [{report_type}]:{e}")
  4129. return False
  4130. print(f"{log_prefix}所有 {len(batches)} 批次发送完成 [{report_type}]")
  4131. return True
# === 主分析器 (main analyzer) ===
  4133. class NewsAnalyzer:
  4134. """新闻分析器"""
  4135. # 模式策略定义
  4136. MODE_STRATEGIES = {
  4137. "incremental": {
  4138. "mode_name": "增量模式",
  4139. "description": "增量模式(只关注新增新闻,无新增时不推送)",
  4140. "realtime_report_type": "实时增量",
  4141. "summary_report_type": "当日汇总",
  4142. "should_send_realtime": True,
  4143. "should_generate_summary": True,
  4144. "summary_mode": "daily",
  4145. },
  4146. "current": {
  4147. "mode_name": "当前榜单模式",
  4148. "description": "当前榜单模式(当前榜单匹配新闻 + 新增新闻区域 + 按时推送)",
  4149. "realtime_report_type": "实时当前榜单",
  4150. "summary_report_type": "当前榜单汇总",
  4151. "should_send_realtime": True,
  4152. "should_generate_summary": True,
  4153. "summary_mode": "current",
  4154. },
  4155. "daily": {
  4156. "mode_name": "当日汇总模式",
  4157. "description": "当日汇总模式(所有匹配新闻 + 新增新闻区域 + 按时推送)",
  4158. "realtime_report_type": "",
  4159. "summary_report_type": "当日汇总",
  4160. "should_send_realtime": False,
  4161. "should_generate_summary": True,
  4162. "summary_mode": "daily",
  4163. },
  4164. }
  4165. def __init__(self):
  4166. self.request_interval = CONFIG["REQUEST_INTERVAL"]
  4167. self.report_mode = CONFIG["REPORT_MODE"]
  4168. self.rank_threshold = CONFIG["RANK_THRESHOLD"]
  4169. self.is_github_actions = os.environ.get("GITHUB_ACTIONS") == "true"
  4170. self.is_docker_container = self._detect_docker_environment()
  4171. self.update_info = None
  4172. self.proxy_url = None
  4173. self._setup_proxy()
  4174. self.data_fetcher = DataFetcher(self.proxy_url)
  4175. if self.is_github_actions:
  4176. self._check_version_update()
  4177. def _detect_docker_environment(self) -> bool:
  4178. """检测是否运行在 Docker 容器中"""
  4179. try:
  4180. if os.environ.get("DOCKER_CONTAINER") == "true":
  4181. return True
  4182. if os.path.exists("/.dockerenv"):
  4183. return True
  4184. return False
  4185. except Exception:
  4186. return False
  4187. def _should_open_browser(self) -> bool:
  4188. """判断是否应该打开浏览器"""
  4189. return not self.is_github_actions and not self.is_docker_container
  4190. def _setup_proxy(self) -> None:
  4191. """设置代理配置"""
  4192. if not self.is_github_actions and CONFIG["USE_PROXY"]:
  4193. self.proxy_url = CONFIG["DEFAULT_PROXY"]
  4194. print("本地环境,使用代理")
  4195. elif not self.is_github_actions and not CONFIG["USE_PROXY"]:
  4196. print("本地环境,未启用代理")
  4197. else:
  4198. print("GitHub Actions环境,不使用代理")
  4199. def _check_version_update(self) -> None:
  4200. """检查版本更新"""
  4201. try:
  4202. need_update, remote_version = check_version_update(
  4203. VERSION, CONFIG["VERSION_CHECK_URL"], self.proxy_url
  4204. )
  4205. if need_update and remote_version:
  4206. self.update_info = {
  4207. "current_version": VERSION,
  4208. "remote_version": remote_version,
  4209. }
  4210. print(f"发现新版本: {remote_version} (当前: {VERSION})")
  4211. else:
  4212. print("版本检查完成,当前为最新版本")
  4213. except Exception as e:
  4214. print(f"版本检查出错: {e}")
  4215. def _get_mode_strategy(self) -> Dict:
  4216. """获取当前模式的策略配置"""
  4217. return self.MODE_STRATEGIES.get(self.report_mode, self.MODE_STRATEGIES["daily"])
  4218. def _has_notification_configured(self) -> bool:
  4219. """检查是否配置了任何通知渠道"""
  4220. return any(
  4221. [
  4222. CONFIG["FEISHU_WEBHOOK_URL"],
  4223. CONFIG["DINGTALK_WEBHOOK_URL"],
  4224. CONFIG["WEWORK_WEBHOOK_URL"],
  4225. (CONFIG["TELEGRAM_BOT_TOKEN"] and CONFIG["TELEGRAM_CHAT_ID"]),
  4226. (
  4227. CONFIG["EMAIL_FROM"]
  4228. and CONFIG["EMAIL_PASSWORD"]
  4229. and CONFIG["EMAIL_TO"]
  4230. ),
  4231. (CONFIG["NTFY_SERVER_URL"] and CONFIG["NTFY_TOPIC"]),
  4232. CONFIG["BARK_URL"],
  4233. CONFIG["SLACK_WEBHOOK_URL"],
  4234. ]
  4235. )
  4236. def _has_valid_content(
  4237. self, stats: List[Dict], new_titles: Optional[Dict] = None
  4238. ) -> bool:
  4239. """检查是否有有效的新闻内容"""
  4240. if self.report_mode in ["incremental", "current"]:
  4241. # 增量模式和current模式下,只要stats有内容就说明有匹配的新闻
  4242. return any(stat["count"] > 0 for stat in stats)
  4243. else:
  4244. # 当日汇总模式下,检查是否有匹配的频率词新闻或新增新闻
  4245. has_matched_news = any(stat["count"] > 0 for stat in stats)
  4246. has_new_news = bool(
  4247. new_titles and any(len(titles) > 0 for titles in new_titles.values())
  4248. )
  4249. return has_matched_news or has_new_news
  4250. def _load_analysis_data(
  4251. self,
  4252. ) -> Optional[Tuple[Dict, Dict, Dict, Dict, List, List]]:
  4253. """统一的数据加载和预处理,使用当前监控平台列表过滤历史数据"""
  4254. try:
  4255. # 获取当前配置的监控平台ID列表
  4256. current_platform_ids = []
  4257. for platform in CONFIG["PLATFORMS"]:
  4258. current_platform_ids.append(platform["id"])
  4259. print(f"当前监控平台: {current_platform_ids}")
  4260. all_results, id_to_name, title_info = read_all_today_titles(
  4261. current_platform_ids
  4262. )
  4263. if not all_results:
  4264. print("没有找到当天的数据")
  4265. return None
  4266. total_titles = sum(len(titles) for titles in all_results.values())
  4267. print(f"读取到 {total_titles} 个标题(已按当前监控平台过滤)")
  4268. new_titles = detect_latest_new_titles(current_platform_ids)
  4269. word_groups, filter_words, global_filters = load_frequency_words()
  4270. return (
  4271. all_results,
  4272. id_to_name,
  4273. title_info,
  4274. new_titles,
  4275. word_groups,
  4276. filter_words,
  4277. global_filters,
  4278. )
  4279. except Exception as e:
  4280. print(f"数据加载失败: {e}")
  4281. return None
  4282. def _prepare_current_title_info(self, results: Dict, time_info: str) -> Dict:
  4283. """从当前抓取结果构建标题信息"""
  4284. title_info = {}
  4285. for source_id, titles_data in results.items():
  4286. title_info[source_id] = {}
  4287. for title, title_data in titles_data.items():
  4288. ranks = title_data.get("ranks", [])
  4289. url = title_data.get("url", "")
  4290. mobile_url = title_data.get("mobileUrl", "")
  4291. title_info[source_id][title] = {
  4292. "first_time": time_info,
  4293. "last_time": time_info,
  4294. "count": 1,
  4295. "ranks": ranks,
  4296. "url": url,
  4297. "mobileUrl": mobile_url,
  4298. }
  4299. return title_info
  4300. def _run_analysis_pipeline(
  4301. self,
  4302. data_source: Dict,
  4303. mode: str,
  4304. title_info: Dict,
  4305. new_titles: Dict,
  4306. word_groups: List[Dict],
  4307. filter_words: List[str],
  4308. id_to_name: Dict,
  4309. failed_ids: Optional[List] = None,
  4310. is_daily_summary: bool = False,
  4311. global_filters: Optional[List[str]] = None,
  4312. ) -> Tuple[List[Dict], str]:
  4313. """统一的分析流水线:数据处理 → 统计计算 → HTML生成"""
  4314. # 统计计算
  4315. stats, total_titles = count_word_frequency(
  4316. data_source,
  4317. word_groups,
  4318. filter_words,
  4319. id_to_name,
  4320. title_info,
  4321. self.rank_threshold,
  4322. new_titles,
  4323. mode=mode,
  4324. global_filters=global_filters,
  4325. )
  4326. # HTML生成
  4327. html_file = generate_html_report(
  4328. stats,
  4329. total_titles,
  4330. failed_ids=failed_ids,
  4331. new_titles=new_titles,
  4332. id_to_name=id_to_name,
  4333. mode=mode,
  4334. is_daily_summary=is_daily_summary,
  4335. update_info=self.update_info if CONFIG["SHOW_VERSION_UPDATE"] else None,
  4336. )
  4337. return stats, html_file
  4338. def _send_notification_if_needed(
  4339. self,
  4340. stats: List[Dict],
  4341. report_type: str,
  4342. mode: str,
  4343. failed_ids: Optional[List] = None,
  4344. new_titles: Optional[Dict] = None,
  4345. id_to_name: Optional[Dict] = None,
  4346. html_file_path: Optional[str] = None,
  4347. ) -> bool:
  4348. """统一的通知发送逻辑,包含所有判断条件"""
  4349. has_notification = self._has_notification_configured()
  4350. if (
  4351. CONFIG["ENABLE_NOTIFICATION"]
  4352. and has_notification
  4353. and self._has_valid_content(stats, new_titles)
  4354. ):
  4355. send_to_notifications(
  4356. stats,
  4357. failed_ids or [],
  4358. report_type,
  4359. new_titles,
  4360. id_to_name,
  4361. self.update_info,
  4362. self.proxy_url,
  4363. mode=mode,
  4364. html_file_path=html_file_path,
  4365. )
  4366. return True
  4367. elif CONFIG["ENABLE_NOTIFICATION"] and not has_notification:
  4368. print("⚠️ 警告:通知功能已启用但未配置任何通知渠道,将跳过通知发送")
  4369. elif not CONFIG["ENABLE_NOTIFICATION"]:
  4370. print(f"跳过{report_type}通知:通知功能已禁用")
  4371. elif (
  4372. CONFIG["ENABLE_NOTIFICATION"]
  4373. and has_notification
  4374. and not self._has_valid_content(stats, new_titles)
  4375. ):
  4376. mode_strategy = self._get_mode_strategy()
  4377. if "实时" in report_type:
  4378. print(
  4379. f"跳过实时推送通知:{mode_strategy['mode_name']}下未检测到匹配的新闻"
  4380. )
  4381. else:
  4382. print(
  4383. f"跳过{mode_strategy['summary_report_type']}通知:未匹配到有效的新闻内容"
  4384. )
  4385. return False
  4386. def _generate_summary_report(self, mode_strategy: Dict) -> Optional[str]:
  4387. """生成汇总报告(带通知)"""
  4388. summary_type = (
  4389. "当前榜单汇总" if mode_strategy["summary_mode"] == "current" else "当日汇总"
  4390. )
  4391. print(f"生成{summary_type}报告...")
  4392. # 加载分析数据
  4393. analysis_data = self._load_analysis_data()
  4394. if not analysis_data:
  4395. return None
  4396. all_results, id_to_name, title_info, new_titles, word_groups, filter_words, global_filters = (
  4397. analysis_data
  4398. )
  4399. # 运行分析流水线
  4400. stats, html_file = self._run_analysis_pipeline(
  4401. all_results,
  4402. mode_strategy["summary_mode"],
  4403. title_info,
  4404. new_titles,
  4405. word_groups,
  4406. filter_words,
  4407. id_to_name,
  4408. is_daily_summary=True,
  4409. global_filters=global_filters,
  4410. )
  4411. print(f"{summary_type}报告已生成: {html_file}")
  4412. # 发送通知
  4413. self._send_notification_if_needed(
  4414. stats,
  4415. mode_strategy["summary_report_type"],
  4416. mode_strategy["summary_mode"],
  4417. failed_ids=[],
  4418. new_titles=new_titles,
  4419. id_to_name=id_to_name,
  4420. html_file_path=html_file,
  4421. )
  4422. return html_file
  4423. def _generate_summary_html(self, mode: str = "daily") -> Optional[str]:
  4424. """生成汇总HTML"""
  4425. summary_type = "当前榜单汇总" if mode == "current" else "当日汇总"
  4426. print(f"生成{summary_type}HTML...")
  4427. # 加载分析数据
  4428. analysis_data = self._load_analysis_data()
  4429. if not analysis_data:
  4430. return None
  4431. all_results, id_to_name, title_info, new_titles, word_groups, filter_words, global_filters = (
  4432. analysis_data
  4433. )
  4434. # 运行分析流水线
  4435. _, html_file = self._run_analysis_pipeline(
  4436. all_results,
  4437. mode,
  4438. title_info,
  4439. new_titles,
  4440. word_groups,
  4441. filter_words,
  4442. id_to_name,
  4443. is_daily_summary=True,
  4444. global_filters=global_filters,
  4445. )
  4446. print(f"{summary_type}HTML已生成: {html_file}")
  4447. return html_file
  4448. def _initialize_and_check_config(self) -> None:
  4449. """通用初始化和配置检查"""
  4450. now = get_beijing_time()
  4451. print(f"当前北京时间: {now.strftime('%Y-%m-%d %H:%M:%S')}")
  4452. if not CONFIG["ENABLE_CRAWLER"]:
  4453. print("爬虫功能已禁用(ENABLE_CRAWLER=False),程序退出")
  4454. return
  4455. has_notification = self._has_notification_configured()
  4456. if not CONFIG["ENABLE_NOTIFICATION"]:
  4457. print("通知功能已禁用(ENABLE_NOTIFICATION=False),将只进行数据抓取")
  4458. elif not has_notification:
  4459. print("未配置任何通知渠道,将只进行数据抓取,不发送通知")
  4460. else:
  4461. print("通知功能已启用,将发送通知")
  4462. mode_strategy = self._get_mode_strategy()
  4463. print(f"报告模式: {self.report_mode}")
  4464. print(f"运行模式: {mode_strategy['description']}")
  4465. def _crawl_data(self) -> Tuple[Dict, Dict, List]:
  4466. """执行数据爬取"""
  4467. ids = []
  4468. for platform in CONFIG["PLATFORMS"]:
  4469. if "name" in platform:
  4470. ids.append((platform["id"], platform["name"]))
  4471. else:
  4472. ids.append(platform["id"])
  4473. print(
  4474. f"配置的监控平台: {[p.get('name', p['id']) for p in CONFIG['PLATFORMS']]}"
  4475. )
  4476. print(f"开始爬取数据,请求间隔 {self.request_interval} 毫秒")
  4477. ensure_directory_exists("output")
  4478. results, id_to_name, failed_ids = self.data_fetcher.crawl_websites(
  4479. ids, self.request_interval
  4480. )
  4481. title_file = save_titles_to_file(results, id_to_name, failed_ids)
  4482. print(f"标题已保存到: {title_file}")
  4483. return results, id_to_name, failed_ids
  4484. def _execute_mode_strategy(
  4485. self, mode_strategy: Dict, results: Dict, id_to_name: Dict, failed_ids: List
  4486. ) -> Optional[str]:
  4487. """执行模式特定逻辑"""
  4488. # 获取当前监控平台ID列表
  4489. current_platform_ids = [platform["id"] for platform in CONFIG["PLATFORMS"]]
  4490. new_titles = detect_latest_new_titles(current_platform_ids)
  4491. time_info = Path(save_titles_to_file(results, id_to_name, failed_ids)).stem
  4492. word_groups, filter_words, global_filters = load_frequency_words()
  4493. # current模式下,实时推送需要使用完整的历史数据来保证统计信息的完整性
  4494. if self.report_mode == "current":
  4495. # 加载完整的历史数据(已按当前平台过滤)
  4496. analysis_data = self._load_analysis_data()
  4497. if analysis_data:
  4498. (
  4499. all_results,
  4500. historical_id_to_name,
  4501. historical_title_info,
  4502. historical_new_titles,
  4503. _,
  4504. _,
  4505. _,
  4506. ) = analysis_data
  4507. print(
  4508. f"current模式:使用过滤后的历史数据,包含平台:{list(all_results.keys())}"
  4509. )
  4510. stats, html_file = self._run_analysis_pipeline(
  4511. all_results,
  4512. self.report_mode,
  4513. historical_title_info,
  4514. historical_new_titles,
  4515. word_groups,
  4516. filter_words,
  4517. historical_id_to_name,
  4518. failed_ids=failed_ids,
  4519. global_filters=global_filters,
  4520. )
  4521. combined_id_to_name = {**historical_id_to_name, **id_to_name}
  4522. print(f"HTML报告已生成: {html_file}")
  4523. # 发送实时通知(使用完整历史数据的统计结果)
  4524. summary_html = None
  4525. if mode_strategy["should_send_realtime"]:
  4526. self._send_notification_if_needed(
  4527. stats,
  4528. mode_strategy["realtime_report_type"],
  4529. self.report_mode,
  4530. failed_ids=failed_ids,
  4531. new_titles=historical_new_titles,
  4532. id_to_name=combined_id_to_name,
  4533. html_file_path=html_file,
  4534. )
  4535. else:
  4536. print("❌ 严重错误:无法读取刚保存的数据文件")
  4537. raise RuntimeError("数据一致性检查失败:保存后立即读取失败")
  4538. else:
  4539. title_info = self._prepare_current_title_info(results, time_info)
  4540. stats, html_file = self._run_analysis_pipeline(
  4541. results,
  4542. self.report_mode,
  4543. title_info,
  4544. new_titles,
  4545. word_groups,
  4546. filter_words,
  4547. id_to_name,
  4548. failed_ids=failed_ids,
  4549. global_filters=global_filters,
  4550. )
  4551. print(f"HTML报告已生成: {html_file}")
  4552. # 发送实时通知(如果需要)
  4553. summary_html = None
  4554. if mode_strategy["should_send_realtime"]:
  4555. self._send_notification_if_needed(
  4556. stats,
  4557. mode_strategy["realtime_report_type"],
  4558. self.report_mode,
  4559. failed_ids=failed_ids,
  4560. new_titles=new_titles,
  4561. id_to_name=id_to_name,
  4562. html_file_path=html_file,
  4563. )
  4564. # 生成汇总报告(如果需要)
  4565. summary_html = None
  4566. if mode_strategy["should_generate_summary"]:
  4567. if mode_strategy["should_send_realtime"]:
  4568. # 如果已经发送了实时通知,汇总只生成HTML不发送通知
  4569. summary_html = self._generate_summary_html(
  4570. mode_strategy["summary_mode"]
  4571. )
  4572. else:
  4573. # daily模式:直接生成汇总报告并发送通知
  4574. summary_html = self._generate_summary_report(mode_strategy)
  4575. # 打开浏览器(仅在非容器环境)
  4576. if self._should_open_browser() and html_file:
  4577. if summary_html:
  4578. summary_url = "file://" + str(Path(summary_html).resolve())
  4579. print(f"正在打开汇总报告: {summary_url}")
  4580. webbrowser.open(summary_url)
  4581. else:
  4582. file_url = "file://" + str(Path(html_file).resolve())
  4583. print(f"正在打开HTML报告: {file_url}")
  4584. webbrowser.open(file_url)
  4585. elif self.is_docker_container and html_file:
  4586. if summary_html:
  4587. print(f"汇总报告已生成(Docker环境): {summary_html}")
  4588. else:
  4589. print(f"HTML报告已生成(Docker环境): {html_file}")
  4590. return summary_html
  4591. def run(self) -> None:
  4592. """执行分析流程"""
  4593. try:
  4594. self._initialize_and_check_config()
  4595. mode_strategy = self._get_mode_strategy()
  4596. results, id_to_name, failed_ids = self._crawl_data()
  4597. self._execute_mode_strategy(mode_strategy, results, id_to_name, failed_ids)
  4598. except Exception as e:
  4599. print(f"分析流程执行出错: {e}")
  4600. raise
  4601. def main():
  4602. try:
  4603. analyzer = NewsAnalyzer()
  4604. analyzer.run()
  4605. except FileNotFoundError as e:
  4606. print(f"❌ 配置文件错误: {e}")
  4607. print("\n请确保以下文件存在:")
  4608. print(" • config/config.yaml")
  4609. print(" • config/frequency_words.txt")
  4610. print("\n参考项目文档进行正确配置")
  4611. except Exception as e:
  4612. print(f"❌ 程序运行错误: {e}")
  4613. raise
  4614. if __name__ == "__main__":
  4615. main()