Files
konabot/konabot/common/web_render/core.py
MixBadGun 500053e630
All checks were successful
continuous-integration/drone/push Build is passing
更稳定的 MarkDown 和 LaTeX 生成!
2025-11-10 21:59:45 +08:00

404 lines
13 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from abc import ABC, abstractmethod
import asyncio
import queue
from typing import Any, Callable, Coroutine, Generic, TypeVar
from loguru import logger
from playwright.async_api import (
Page,
Playwright,
async_playwright,
Browser,
BrowserContext,
Error as PlaywrightError,
)
from .config import web_render_config
from playwright.async_api import ConsoleMessage, Page
T = TypeVar("T")
TFunction = Callable[[T], Coroutine[Any, Any, Any]]
PageFunction = Callable[[Page], Coroutine[Any, Any, Any]]
class WebRenderer:
browser_pool: queue.Queue["WebRendererInstance"] = queue.Queue()
context_pool: dict[int, BrowserContext] = {} # 长期挂载的浏览器上下文池
page_pool: dict[str, Page] = {} # 长期挂载的页面池
@classmethod
async def get_browser_instance(cls) -> "WebRendererInstance":
if cls.browser_pool.empty():
if web_render_config.module_web_render_playwright_ws:
instance = await RemotePlaywrightInstance.create(
web_render_config.module_web_render_playwright_ws
)
else:
instance = await LocalPlaywrightInstance.create()
cls.browser_pool.put(instance)
instance = cls.browser_pool.get()
cls.browser_pool.put(instance)
return instance
@classmethod
async def render(
cls,
url: str,
target: str,
params: dict = {},
other_function: PageFunction | None = None,
timeout: int = 30,
) -> bytes:
"""
访问指定URL并返回截图
:param url: 目标URL
:param target: 渲染目标,如 ".box""#main" 等CSS选择器
:param timeout: 页面加载超时时间,单位秒
:param params: URL键值对参数
:param other_function: 其他自定义操作函数接受page参数
:return: 截图的字节数据
"""
instance = await cls.get_browser_instance()
logger.debug(
f"Using WebRendererInstance {id(instance)} to render {url} targeting {target}"
)
return await instance.render(
url, target, params=params, other_function=other_function, timeout=timeout
)
@classmethod
async def render_file(
cls,
file_path: str,
target: str,
params: dict = {},
other_function: PageFunction | None = None,
timeout: int = 30,
) -> bytes:
"""
访问指定本地文件URL并返回截图
:param file_path: 目标文件路径
:param target: 渲染目标,如 ".box""#main" 等CSS选择器
:param timeout: 页面加载超时时间,单位秒
:param params: URL键值对参数
:param other_function: 其他自定义操作函数接受page参数
:return: 截图的字节数据
"""
instance = await cls.get_browser_instance()
logger.debug(
f"Using WebRendererInstance {id(instance)} to render file {file_path} targeting {target}"
)
return await instance.render_file(
file_path,
target,
params=params,
other_function=other_function,
timeout=timeout,
)
@classmethod
async def render_with_persistent_page(
cls,
page_id: str,
url: str,
target: str,
params: dict = {},
other_function: PageFunction | None = None,
timeout: int = 30,
) -> bytes:
"""
使用长期挂载的页面进行渲染
:param page_id: 页面唯一标识符
:param target: 渲染目标,如 ".box""#main" 等CSS选择器
:param timeout: 页面加载超时时间,单位秒
:param params: URL键值对参数
:param other_function: 其他自定义操作函数接受page参数
:return: 截图的字节数据
"""
instance = await cls.get_browser_instance()
logger.debug(
f"Using WebRendererInstance {id(instance)} to render with persistent page {page_id} targeting {target}"
)
return await instance.render_with_persistent_page(
page_id,
url,
target,
params=params,
other_function=other_function,
timeout=timeout,
)
@classmethod
async def get_persistent_page(cls, page_id: str, url: str) -> Page:
"""
获取长期挂载的页面,如果不存在则创建一个新的页面并存储
"""
if page_id in cls.page_pool:
return cls.page_pool[page_id]
async def on_console(msg: ConsoleMessage):
logger.debug(f"WEB CONSOLE {msg.text}")
instance = await cls.get_browser_instance()
if isinstance(instance, RemotePlaywrightInstance):
context = await instance.browser.new_context()
page = await context.new_page()
await page.goto(url)
cls.page_pool[page_id] = page
logger.debug(f"Created new persistent page for page_id {page_id}, navigated to {url}")
page.on('console', on_console)
return page
elif isinstance(instance, LocalPlaywrightInstance):
context = await instance.browser.new_context()
page = await context.new_page()
await page.goto(url)
cls.page_pool[page_id] = page
logger.debug(f"Created new persistent page for page_id {page_id}, navigated to {url}")
page.on('console', on_console)
return page
else:
raise NotImplementedError("Unsupported WebRendererInstance type")
@classmethod
async def close_persistent_page(cls, page_id: str) -> None:
"""
关闭并移除长期挂载的页面
:param page_id: 页面唯一标识符
"""
if page_id in cls.page_pool:
page = cls.page_pool[page_id]
await page.close()
del cls.page_pool[page_id]
logger.debug(f"Closed and removed persistent page for page_id {page_id}")
class WebRendererInstance(ABC, Generic[T]):
@abstractmethod
async def render(
self,
url: str,
target: str,
index: int = 0,
params: dict[str, Any] | None = None,
other_function: TFunction | None = None,
timeout: int = 30,
) -> bytes: ...
@abstractmethod
async def render_file(
self,
file_path: str,
target: str,
index: int = 0,
params: dict[str, Any] | None = None,
other_function: PageFunction | None = None,
timeout: int = 30,
) -> bytes: ...
@abstractmethod
async def render_with_persistent_page(
self,
page_id: str,
url: str,
target: str,
params: dict = {},
other_function: PageFunction | None = None,
timeout: int = 30,
) -> bytes: ...
class PlaywrightInstance(WebRendererInstance[Page]):
def __init__(self) -> None:
super().__init__()
self.lock = asyncio.Lock()
@property
@abstractmethod
def browser(self) -> Browser: ...
async def render(
self,
url: str,
target: str,
index: int = 0,
params: dict[str, Any] | None = None,
other_function: PageFunction | None = None,
timeout: int = 30,
) -> bytes:
"""
访问指定URL并返回截图
:param url: 目标URL
:param target: 渲染目标,如 ".box""#main" 等CSS选择器
:param timeout: 页面加载超时时间,单位秒
:param index: 如果目标是一个列表,指定要截图的元素索引
:param params: URL键值对参数
:param other_function: 其他自定义操作函数接受page参数
:return: 截图的字节数据
"""
async with self.lock:
context = await self.browser.new_context()
page = await context.new_page()
screenshot = await self.inner_render(
page, url, target, index, params or {}, other_function, timeout
)
await page.close()
await context.close()
return screenshot
async def render_file(
self,
file_path: str,
target: str,
index: int = 0,
params: dict[str, Any] | None = None,
other_function: PageFunction | None = None,
timeout: int = 30,
) -> bytes:
file_path = "file:///" + str(file_path).replace("\\", "/")
return await self.render(
file_path, target, index, params or {}, other_function, timeout
)
async def render_with_persistent_page(
self,
page_id: str,
url: str,
target: str,
params: dict = {},
other_function: PageFunction | None = None,
timeout: int = 30,
) -> bytes:
page = await WebRenderer.get_persistent_page(page_id, url)
screenshot = await self.inner_render(
page, url, target, 0, params, other_function, timeout
)
return screenshot
async def inner_render(
self,
page: Page,
url: str,
target: str,
index: int = 0,
params: dict = {},
other_function: PageFunction | None = None,
timeout: int = 30,
) -> bytes:
logger.debug(f"Navigating to {url} with timeout {timeout}")
url_with_params = url + (
"?" + "&".join(f"{k}={v}" for k, v in params.items()) if params else ""
)
await page.goto(url_with_params, timeout=timeout * 1000, wait_until="load")
logger.debug("Page loaded successfully")
# 等待目标元素出现
await page.wait_for_selector(target, timeout=timeout * 1000)
logger.debug(f"Target element '{target}' found, taking screenshot")
if other_function:
await other_function(page)
elements = await page.query_selector_all(target)
if not elements:
logger.warning(f"Target element '{target}' not found on the page.")
elements = await page.query_selector_all('body')
if index >= len(elements):
logger.warning(f"Index {index} out of range for elements matching '{target}'")
index = 0
element = elements[index]
screenshot = await element.screenshot()
logger.debug("Screenshot taken successfully")
return screenshot
class LocalPlaywrightInstance(PlaywrightInstance):
def __init__(self):
self._playwright: Playwright | None = None
self._browser: Browser | None = None
super().__init__()
@property
def playwright(self) -> Playwright:
assert self._playwright is not None
return self._playwright
@property
def browser(self) -> Browser:
assert self._browser is not None
return self._browser
async def init(self):
self._playwright = await async_playwright().start()
self._browser = await self.playwright.chromium.launch(headless=True)
@classmethod
async def create(cls) -> "WebRendererInstance":
instance = cls()
await instance.init()
return instance
async def close(self):
await self.browser.close()
await self.playwright.stop()
class RemotePlaywrightInstance(PlaywrightInstance):
def __init__(self, ws_endpoint: str) -> None:
self._playwright: Playwright | None = None
self._browser: Browser | None = None
self._ws_endpoint = ws_endpoint
super().__init__()
@property
def playwright(self) -> Playwright:
assert self._playwright is not None, "Playwright must be initialized by calling init()."
return self._playwright
@property
def browser(self) -> Browser:
assert self._browser is not None, "Browser must be connected by calling init()."
return self._browser
async def init(self):
logger.info(f"尝试连接远程 Playwright 服务器: {self._ws_endpoint}")
self._playwright = await async_playwright().start()
try:
self._browser = await self.playwright.chromium.connect(
self._ws_endpoint
)
logger.info("成功连接到远程 Playwright 服务器。")
except PlaywrightError as e:
await self.playwright.stop()
raise ConnectionError(
f"无法连接到远程 Playwright 服务器 ({self._ws_endpoint}){e}"
) from e
@classmethod
async def create(cls, ws_endpoint: str) -> "RemotePlaywrightInstance":
"""
创建并初始化远程 Playwright 实例的工厂方法。
"""
instance = cls(ws_endpoint)
await instance.init()
return instance
async def close(self):
"""
断开与远程浏览器的连接并停止本地 Playwright 实例。
"""
if self._browser:
await self.browser.close()
if self._playwright:
await self.playwright.stop()
print("已断开远程连接,本地 Playwright 实例已停止。")