"""Render web pages or local HTML files to element screenshots via Playwright."""

from abc import ABC, abstractmethod
import asyncio
import queue
from typing import Any, Callable, Coroutine, Generic, TypeVar
from urllib.parse import urlencode

from loguru import logger
from playwright.async_api import (
    Page,
    Playwright,
    async_playwright,
    Browser,
    BrowserContext,
    ConsoleMessage,
    Error as PlaywrightError,
)

from .config import web_render_config

T = TypeVar("T")
# Async callback receiving the backend-specific page object.
TFunction = Callable[[T], Coroutine[Any, Any, Any]]
# Async callback receiving a Playwright ``Page``.
PageFunction = Callable[[Page], Coroutine[Any, Any, Any]]


def _append_query_params(url: str, params: dict[str, Any] | None) -> str:
    """Return *url* with *params* appended as a percent-encoded query string.

    An empty or missing *params* returns *url* unchanged. When *url* already
    carries a query component the new pairs are joined with ``&`` instead of
    starting a second ``?``.
    """
    if not params:
        return url
    if url.endswith(("?", "&")):
        separator = ""
    elif "?" in url:
        separator = "&"
    else:
        separator = "?"
    return f"{url}{separator}{urlencode(params)}"


class WebRenderer:
    """Class-level facade that hands render requests to pooled browser instances."""

    browser_pool: queue.Queue["WebRendererInstance"] = queue.Queue()
    context_pool: dict[int, BrowserContext] = {}  # long-lived browser context pool
    page_pool: dict[str, Page] = {}  # long-lived page pool, keyed by page_id

    @classmethod
    async def get_browser_instance(cls) -> "WebRendererInstance":
        """Return a browser instance from the pool, creating one if the pool is empty.

        A remote instance is created when
        ``web_render_config.module_web_render_playwright_ws`` is truthy,
        otherwise a local one is launched.
        """
        if cls.browser_pool.empty():
            ws_endpoint = web_render_config.module_web_render_playwright_ws
            if ws_endpoint:
                instance = await RemotePlaywrightInstance.create(ws_endpoint)
            else:
                instance = await LocalPlaywrightInstance.create()
            cls.browser_pool.put(instance)

        # Take the head of the queue and put it straight back at the tail, so
        # successive calls rotate through the pooled instances.
        instance = cls.browser_pool.get()
        cls.browser_pool.put(instance)
        return instance

    @classmethod
    async def render(
        cls,
        url: str,
        target: str,
        params: dict | None = None,
        other_function: PageFunction | None = None,
        timeout: int = 30,
    ) -> bytes:
        """
        Visit the given URL and return a screenshot.

        :param url: target URL
        :param target: render target, a CSS selector such as ".box" or "#main"
        :param params: key/value pairs appended to the URL as query parameters
        :param other_function: optional custom coroutine function that receives the page
        :param timeout: page load timeout, in seconds
        :return: screenshot bytes
        """
        instance = await cls.get_browser_instance()
        logger.debug(
            f"Using WebRendererInstance {id(instance)} to render {url} targeting {target}"
        )
        return await instance.render(
            url, target, params=params, other_function=other_function, timeout=timeout
        )

    @classmethod
    async def render_file(
        cls,
        file_path: str,
        target: str,
        params: dict | None = None,
        other_function: PageFunction | None = None,
        timeout: int = 30,
    ) -> bytes:
        """
        Visit the given local file and return a screenshot.

        :param file_path: path of the file to render
        :param target: render target, a CSS selector such as ".box" or "#main"
        :param params: key/value pairs appended to the URL as query parameters
        :param other_function: optional custom coroutine function that receives the page
        :param timeout: page load timeout, in seconds
        :return: screenshot bytes
        """
        instance = await cls.get_browser_instance()
        logger.debug(
            f"Using WebRendererInstance {id(instance)} to render file {file_path} targeting {target}"
        )
        return await instance.render_file(
            file_path,
            target,
            params=params,
            other_function=other_function,
            timeout=timeout,
        )

    @classmethod
    async def render_with_persistent_page(
        cls,
        page_id: str,
        url: str,
        target: str,
        params: dict | None = None,
        other_function: PageFunction | None = None,
        timeout: int = 30,
    ) -> bytes:
        """
        Render using a long-lived page.

        :param page_id: unique identifier of the page
        :param url: target URL
        :param target: render target, a CSS selector such as ".box" or "#main"
        :param params: key/value pairs appended to the URL as query parameters
        :param other_function: optional custom coroutine function that receives the page
        :param timeout: page load timeout, in seconds
        :return: screenshot bytes
        """
        instance = await cls.get_browser_instance()
        logger.debug(
            f"Using WebRendererInstance {id(instance)} to render with persistent page {page_id} targeting {target}"
        )
        return await instance.render_with_persistent_page(
            page_id,
            url,
            target,
            params=params,
            other_function=other_function,
            timeout=timeout,
        )

    @classmethod
    async def get_persistent_page(cls, page_id: str, url: str) -> Page:
        """
        Return the long-lived page for *page_id*, creating and storing it if needed.

        A new page is opened in its own browser context and navigated to *url*.
        A cached page that has already been closed is discarded and recreated.

        :param page_id: unique identifier of the page
        :param url: URL a newly created page is navigated to
        :raises NotImplementedError: if the pooled instance is not Playwright-based
        """
        cached = cls.page_pool.get(page_id)
        if cached is not None:
            if not cached.is_closed():
                return cached
            # A closed page cannot be reused; forget it and build a new one.
            del cls.page_pool[page_id]

        async def on_console(msg: ConsoleMessage) -> None:
            logger.debug(f"WEB CONSOLE {msg.text}")

        instance = await cls.get_browser_instance()
        if not isinstance(instance, PlaywrightInstance):
            raise NotImplementedError("Unsupported WebRendererInstance type")

        context = await instance.browser.new_context()
        try:
            page = await context.new_page()
            # Attach the listener before navigating so messages emitted while
            # the page loads are logged too.
            page.on('console', on_console)
            await page.goto(url)
        except BaseException:
            # Do not leak the freshly created context when setup fails or is
            # cancelled; the error is re-raised unchanged.
            await context.close()
            raise

        cls.page_pool[page_id] = page
        logger.debug(f"Created new persistent page for page_id {page_id}, navigated to {url}")
        return page

    @classmethod
    async def close_persistent_page(cls, page_id: str) -> None:
        """
        Close and remove a long-lived page.

        The browser context the page belongs to is closed as well, since
        ``get_persistent_page`` creates one context per page.

        :param page_id: unique identifier of the page
        """
        page = cls.page_pool.pop(page_id, None)
        if page is None:
            return
        context = page.context
        await page.close()
        await context.close()
        logger.debug(f"Closed and removed persistent page for page_id {page_id}")


class WebRendererInstance(ABC, Generic[T]):
    """Abstract rendering backend; ``T`` is the page type given to callbacks."""

    @abstractmethod
    async def render(
        self,
        url: str,
        target: str,
        index: int = 0,
        params: dict[str, Any] | None = None,
        other_function: TFunction | None = None,
        timeout: int = 30,
    ) -> bytes:
        """Render *url* and return screenshot bytes of the *target* element."""
        ...

    @abstractmethod
    async def render_file(
        self,
        file_path: str,
        target: str,
        index: int = 0,
        params: dict[str, Any] | None = None,
        other_function: PageFunction | None = None,
        timeout: int = 30,
    ) -> bytes:
        """Render a local file and return screenshot bytes of the *target* element."""
        ...

    @abstractmethod
    async def render_with_persistent_page(
        self,
        page_id: str,
        url: str,
        target: str,
        params: dict | None = None,
        other_function: PageFunction | None = None,
        timeout: int = 30,
    ) -> bytes:
        """Render *url* on the long-lived page identified by *page_id*."""
        ...


class PlaywrightInstance(WebRendererInstance[Page]):
    """Shared rendering logic for backends that expose a Playwright ``Browser``."""

    def __init__(self) -> None:
        super().__init__()
        # Held for the whole of render(), so one-shot renders on this instance
        # run one at a time.
        self.lock = asyncio.Lock()

    @property
    @abstractmethod
    def browser(self) -> Browser:
        """The Playwright browser used to create contexts and pages."""
        ...

    async def render(
        self,
        url: str,
        target: str,
        index: int = 0,
        params: dict[str, Any] | None = None,
        other_function: PageFunction | None = None,
        timeout: int = 30,
    ) -> bytes:
        """
        Visit the given URL in a throwaway context and return a screenshot.

        :param url: target URL
        :param target: render target, a CSS selector such as ".box" or "#main"
        :param index: which element to capture when the selector matches several
        :param params: key/value pairs appended to the URL as query parameters
        :param other_function: optional custom coroutine function that receives the page
        :param timeout: page load timeout, in seconds
        :return: screenshot bytes
        """
        async with self.lock:
            context = await self.browser.new_context()
            try:
                page = await context.new_page()
                return await self.inner_render(
                    page, url, target, index, params or {}, other_function, timeout
                )
            finally:
                # Closing the context also closes its pages, so nothing is
                # left behind when rendering raises.
                await context.close()

    async def render_file(
        self,
        file_path: str,
        target: str,
        index: int = 0,
        params: dict[str, Any] | None = None,
        other_function: PageFunction | None = None,
        timeout: int = 30,
    ) -> bytes:
        """Render a local file by turning *file_path* into a ``file:///`` URL.

        Backslashes in the path are replaced with forward slashes; the
        remaining arguments are forwarded to :meth:`render`.
        """
        file_url = "file:///" + str(file_path).replace("\\", "/")
        return await self.render(
            file_url, target, index, params or {}, other_function, timeout
        )

    async def render_with_persistent_page(
        self,
        page_id: str,
        url: str,
        target: str,
        params: dict | None = None,
        other_function: PageFunction | None = None,
        timeout: int = 30,
    ) -> bytes:
        """Render *url* on the long-lived page for *page_id*.

        The page comes from ``WebRenderer.get_persistent_page`` and is left
        open afterwards. The first matching element is captured.
        """
        page = await WebRenderer.get_persistent_page(page_id, url)
        return await self.inner_render(
            page, url, target, 0, params or {}, other_function, timeout
        )

    async def inner_render(
        self,
        page: Page,
        url: str,
        target: str,
        index: int = 0,
        params: dict | None = None,
        other_function: PageFunction | None = None,
        timeout: int = 30,
    ) -> bytes:
        """
        Navigate *page* to *url* and screenshot the element matching *target*.

        :param page: page to navigate and capture
        :param url: target URL
        :param target: render target, a CSS selector
        :param index: which matching element to capture; falls back to 0 when
            it is past the end of the matches
        :param params: key/value pairs appended to the URL as a percent-encoded
            query string
        :param other_function: optional coroutine function awaited with the
            page after the target has appeared and before the screenshot
        :param timeout: timeout in seconds, applied separately to navigation
            and to waiting for the target
        :return: screenshot bytes
        """
        logger.debug(f"Navigating to {url} with timeout {timeout}")
        timeout_ms = timeout * 1000  # Playwright timeouts are in milliseconds
        url_with_params = _append_query_params(url, params)
        await page.goto(url_with_params, timeout=timeout_ms, wait_until="load")
        logger.debug("Page loaded successfully")

        # Wait for the target element to appear.
        await page.wait_for_selector(target, timeout=timeout_ms)
        logger.debug(f"Target element '{target}' found, taking screenshot")

        if other_function:
            await other_function(page)

        # Query again: the custom function may have changed the DOM.
        elements = await page.query_selector_all(target)
        if not elements:
            logger.warning(f"Target element '{target}' not found on the page.")
            elements = await page.query_selector_all('body')
        if index >= len(elements):
            logger.warning(f"Index {index} out of range for elements matching '{target}'")
            index = 0

        screenshot = await elements[index].screenshot()
        logger.debug("Screenshot taken successfully")
        return screenshot


class LocalPlaywrightInstance(PlaywrightInstance):
    """Backend that launches a headless Chromium through a local Playwright."""

    def __init__(self) -> None:
        self._playwright: Playwright | None = None
        self._browser: Browser | None = None
        super().__init__()

    @property
    def playwright(self) -> Playwright:
        """The started Playwright driver; only available after :meth:`init`."""
        assert self._playwright is not None
        return self._playwright

    @property
    def browser(self) -> Browser:
        """The launched browser; only available after :meth:`init`."""
        assert self._browser is not None
        return self._browser

    async def init(self) -> None:
        """Start Playwright and launch headless Chromium."""
        self._playwright = await async_playwright().start()
        self._browser = await self.playwright.chromium.launch(headless=True)

    @classmethod
    async def create(cls) -> "LocalPlaywrightInstance":
        """Factory that builds and initialises a local instance."""
        instance = cls()
        await instance.init()
        return instance

    async def close(self) -> None:
        """Close the browser and stop Playwright; a no-op for parts never started."""
        if self._browser:
            await self._browser.close()
            self._browser = None
        if self._playwright:
            await self._playwright.stop()
            self._playwright = None


class RemotePlaywrightInstance(PlaywrightInstance):
    """Backend that connects to a remote Playwright server over WebSocket."""

    def __init__(self, ws_endpoint: str) -> None:
        self._playwright: Playwright | None = None
        self._browser: Browser | None = None
        self._ws_endpoint = ws_endpoint
        super().__init__()

    @property
    def playwright(self) -> Playwright:
        """The started Playwright driver; only available after :meth:`init`."""
        assert self._playwright is not None, "Playwright must be initialized by calling init()."
        return self._playwright

    @property
    def browser(self) -> Browser:
        """The connected remote browser; only available after :meth:`init`."""
        assert self._browser is not None, "Browser must be connected by calling init()."
        return self._browser

    async def init(self) -> None:
        """Start Playwright and connect to the remote server.

        :raises ConnectionError: if the connection attempt fails with a
            Playwright error; the local Playwright driver is stopped first
        """
        logger.info(f"尝试连接远程 Playwright 服务器: {self._ws_endpoint}")
        self._playwright = await async_playwright().start()
        try:
            self._browser = await self.playwright.chromium.connect(
                self._ws_endpoint
            )
            logger.info("成功连接到远程 Playwright 服务器。")
        except PlaywrightError as e:
            await self.playwright.stop()
            # Drop the stopped driver so a later close() does not stop it again.
            self._playwright = None
            raise ConnectionError(
                f"无法连接到远程 Playwright 服务器 ({self._ws_endpoint}):{e}"
            ) from e

    @classmethod
    async def create(cls, ws_endpoint: str) -> "RemotePlaywrightInstance":
        """
        Factory that builds and initialises a remote Playwright instance.
        """
        instance = cls(ws_endpoint)
        await instance.init()
        return instance

    async def close(self) -> None:
        """
        Disconnect from the remote browser and stop the local Playwright driver.
        """
        if self._browser:
            await self._browser.close()
            self._browser = None
        if self._playwright:
            await self._playwright.stop()
            self._playwright = None
        logger.info("已断开远程连接,本地 Playwright 实例已停止。")