# Source listing metadata: 404 lines, 13 KiB, Python
import asyncio
import queue
from abc import ABC, abstractmethod
from typing import Any, Callable, Coroutine, Generic, TypeVar
from urllib.parse import quote, urlencode

from loguru import logger
from playwright.async_api import (
    Browser,
    BrowserContext,
    ConsoleMessage,
    Error as PlaywrightError,
    Page,
    Playwright,
    async_playwright,
)

from .config import web_render_config

# Handle type a renderer backend passes to user callbacks; the Playwright
# backends in this file bind it to ``Page``.
T = TypeVar("T")
# Async callback that receives a backend handle of type ``T``. The renderers
# in this file await it and discard whatever it returns.
TFunction = Callable[[T], Coroutine[Any, Any, Any]]
# ``TFunction`` specialised to a Playwright ``Page``.
PageFunction = Callable[[Page], Coroutine[Any, Any, Any]]


class WebRenderer:
    """Class-level facade over a pool of browser backends.

    All state lives in class attributes, so every caller in the process
    shares the same browser instances and persistent pages.
    """

    # Backends created so far; ``get_browser_instance`` rotates through them.
    browser_pool: queue.Queue["WebRendererInstance"] = queue.Queue()
    # Long-lived browser contexts (declared here; not used by this class).
    context_pool: dict[int, BrowserContext] = {}
    # Long-lived pages keyed by the caller-chosen ``page_id``.
    page_pool: dict[str, Page] = {}

    @classmethod
    async def get_browser_instance(cls) -> "WebRendererInstance":
        """
        Return a browser backend, creating the first one on demand.

        A remote backend is created when
        ``web_render_config.module_web_render_playwright_ws`` is set,
        otherwise a local one is launched.

        :return: a ready-to-use backend instance
        """
        if cls.browser_pool.empty():
            ws_endpoint = web_render_config.module_web_render_playwright_ws
            if ws_endpoint:
                instance = await RemotePlaywrightInstance.create(ws_endpoint)
            else:
                instance = await LocalPlaywrightInstance.create()
            cls.browser_pool.put(instance)
        # Pop the head and push it back to the tail: the instance stays in the
        # pool and successive calls rotate through all pooled instances.
        instance = cls.browser_pool.get()
        cls.browser_pool.put(instance)
        return instance

    @classmethod
    async def render(
        cls,
        url: str,
        target: str,
        params: dict[str, Any] | None = None,
        other_function: PageFunction | None = None,
        timeout: int = 30,
    ) -> bytes:
        """
        Open the given URL and return a screenshot of the target element.

        :param url: URL to open
        :param target: CSS selector of the element to capture, e.g. ".box" or "#main"
        :param params: key/value pairs appended to the URL as query parameters
        :param other_function: optional async callback that receives the page
        :param timeout: page-load timeout in seconds
        :return: screenshot bytes
        """
        instance = await cls.get_browser_instance()
        logger.debug(
            f"Using WebRendererInstance {id(instance)} to render {url} targeting {target}"
        )
        return await instance.render(
            url, target, params=params, other_function=other_function, timeout=timeout
        )

    @classmethod
    async def render_file(
        cls,
        file_path: str,
        target: str,
        params: dict[str, Any] | None = None,
        other_function: PageFunction | None = None,
        timeout: int = 30,
    ) -> bytes:
        """
        Open a local file in the browser and return a screenshot of the target element.

        :param file_path: path of the file to open
        :param target: CSS selector of the element to capture, e.g. ".box" or "#main"
        :param params: key/value pairs appended to the URL as query parameters
        :param other_function: optional async callback that receives the page
        :param timeout: page-load timeout in seconds
        :return: screenshot bytes
        """
        instance = await cls.get_browser_instance()
        logger.debug(
            f"Using WebRendererInstance {id(instance)} to render file {file_path} targeting {target}"
        )
        return await instance.render_file(
            file_path,
            target,
            params=params,
            other_function=other_function,
            timeout=timeout,
        )

    @classmethod
    async def render_with_persistent_page(
        cls,
        page_id: str,
        url: str,
        target: str,
        params: dict[str, Any] | None = None,
        other_function: PageFunction | None = None,
        timeout: int = 30,
    ) -> bytes:
        """
        Render using a long-lived page that is kept open between calls.

        :param page_id: unique identifier of the persistent page
        :param url: URL to open in that page
        :param target: CSS selector of the element to capture, e.g. ".box" or "#main"
        :param params: key/value pairs appended to the URL as query parameters
        :param other_function: optional async callback that receives the page
        :param timeout: page-load timeout in seconds
        :return: screenshot bytes
        """
        instance = await cls.get_browser_instance()
        logger.debug(
            f"Using WebRendererInstance {id(instance)} to render with persistent page {page_id} targeting {target}"
        )
        return await instance.render_with_persistent_page(
            page_id,
            url,
            target,
            # Always hand the backend a real dict: its signature does not
            # promise to accept None for this method.
            params=params or {},
            other_function=other_function,
            timeout=timeout,
        )

    @classmethod
    async def get_persistent_page(cls, page_id: str, url: str) -> Page:
        """
        Return the persistent page for ``page_id``, creating it if needed.

        A new page gets its own browser context, is navigated to ``url`` and
        is stored in ``page_pool``. A cached page that has been closed in the
        meantime is discarded and replaced.

        :param page_id: unique identifier of the persistent page
        :param url: URL a newly created page is navigated to
        :return: the cached or newly created page
        :raises NotImplementedError: if the backend is not a Playwright instance
        """
        cached = cls.page_pool.get(page_id)
        if cached is not None:
            if not cached.is_closed():
                return cached
            # A closed page can never render again; drop it and rebuild.
            del cls.page_pool[page_id]
            logger.debug(f"Persistent page for page_id {page_id} was closed, recreating it")

        instance = await cls.get_browser_instance()
        if not isinstance(instance, (RemotePlaywrightInstance, LocalPlaywrightInstance)):
            raise NotImplementedError("Unsupported WebRendererInstance type")

        async def on_console(msg: ConsoleMessage) -> None:
            logger.debug(f"WEB CONSOLE {msg.text}")

        context = await instance.browser.new_context()
        try:
            page = await context.new_page()
            # Subscribe before navigating so messages emitted while the page
            # loads are forwarded to the log as well.
            page.on("console", on_console)
            await page.goto(url)
        except BaseException:
            # Nothing references the context yet, so release it here or it
            # would stay open for the lifetime of the browser.
            await context.close()
            raise

        cls.page_pool[page_id] = page
        logger.debug(f"Created new persistent page for page_id {page_id}, navigated to {url}")
        return page

    @classmethod
    async def close_persistent_page(cls, page_id: str) -> None:
        """
        Close a persistent page and remove it from the pool.

        Unknown ids are ignored. The page's browser context is closed too,
        because ``get_persistent_page`` creates one context per page.

        :param page_id: unique identifier of the persistent page
        """
        # Remove first so a failing close cannot leave a dead page in the pool.
        page = cls.page_pool.pop(page_id, None)
        if page is None:
            return
        try:
            await page.close()
        finally:
            await page.context.close()
        logger.debug(f"Closed and removed persistent page for page_id {page_id}")


class WebRendererInstance(ABC, Generic[T]):
    """Abstract interface every rendering backend implements.

    ``T`` is the handle type passed to ``other_function`` callbacks.
    """

    @abstractmethod
    async def render(
        self,
        url: str,
        target: str,
        index: int = 0,
        params: dict[str, Any] | None = None,
        other_function: TFunction[T] | None = None,
        timeout: int = 30,
    ) -> bytes:
        """
        Open ``url`` and return a screenshot of the element matching ``target``.

        :param url: URL to open
        :param target: CSS selector of the element to capture
        :param index: which match to capture when the selector matches several
        :param params: key/value pairs appended to the URL as query parameters
        :param other_function: optional async callback run against the backend handle
        :param timeout: timeout in seconds
        :return: screenshot bytes
        """
        ...

    @abstractmethod
    async def render_file(
        self,
        file_path: str,
        target: str,
        index: int = 0,
        params: dict[str, Any] | None = None,
        other_function: TFunction[T] | None = None,
        timeout: int = 30,
    ) -> bytes:
        """
        Open a local file and return a screenshot of the element matching ``target``.

        :param file_path: path of the file to open
        :param target: CSS selector of the element to capture
        :param index: which match to capture when the selector matches several
        :param params: key/value pairs appended to the URL as query parameters
        :param other_function: optional async callback run against the backend handle
        :param timeout: timeout in seconds
        :return: screenshot bytes
        """
        ...

    @abstractmethod
    async def render_with_persistent_page(
        self,
        page_id: str,
        url: str,
        target: str,
        params: dict[str, Any] | None = None,
        other_function: TFunction[T] | None = None,
        timeout: int = 30,
    ) -> bytes:
        """
        Render ``url`` in a page that stays open between calls.

        :param page_id: unique identifier of the persistent page
        :param url: URL to open
        :param target: CSS selector of the element to capture
        :param params: key/value pairs appended to the URL as query parameters
        :param other_function: optional async callback run against the backend handle
        :param timeout: timeout in seconds
        :return: screenshot bytes
        """
        ...


class PlaywrightInstance(WebRendererInstance[Page]):
    """Rendering logic shared by the local and remote Playwright backends.

    Subclasses only have to provide the ``browser`` property.
    """

    def __init__(self) -> None:
        super().__init__()
        # Held for the whole of ``render`` so one-shot renders on the same
        # browser run one at a time.
        self.lock = asyncio.Lock()

    @property
    @abstractmethod
    def browser(self) -> Browser:
        """The Playwright browser that pages are opened in."""
        ...

    async def render(
        self,
        url: str,
        target: str,
        index: int = 0,
        params: dict[str, Any] | None = None,
        other_function: PageFunction | None = None,
        timeout: int = 30,
    ) -> bytes:
        """
        Open the given URL in a throw-away context and return a screenshot.

        :param url: URL to open
        :param target: CSS selector of the element to capture, e.g. ".box" or "#main"
        :param index: which match to capture when the selector matches several elements
        :param params: key/value pairs appended to the URL as query parameters
        :param other_function: optional async callback that receives the page
        :param timeout: page-load timeout in seconds
        :return: screenshot bytes
        """
        async with self.lock:
            context = await self.browser.new_context()
            try:
                page = await context.new_page()
                return await self.inner_render(
                    page, url, target, index, params or {}, other_function, timeout
                )
            finally:
                # Runs on failures too (navigation or selector timeouts are
                # common), so the context never outlives the call. Closing a
                # context also closes every page that belongs to it.
                await context.close()

    async def render_file(
        self,
        file_path: str,
        target: str,
        index: int = 0,
        params: dict[str, Any] | None = None,
        other_function: PageFunction | None = None,
        timeout: int = 30,
    ) -> bytes:
        """
        Open a local file through a ``file:///`` URL and return a screenshot.

        :param file_path: path of the file to open; backslashes are converted to "/"
        :param target: CSS selector of the element to capture
        :param index: which match to capture when the selector matches several elements
        :param params: key/value pairs appended to the URL as query parameters
        :param other_function: optional async callback that receives the page
        :param timeout: page-load timeout in seconds
        :return: screenshot bytes
        """
        file_url = "file:///" + str(file_path).replace("\\", "/")
        return await self.render(
            file_url, target, index, params or {}, other_function, timeout
        )

    async def render_with_persistent_page(
        self,
        page_id: str,
        url: str,
        target: str,
        params: dict[str, Any] | None = None,
        other_function: PageFunction | None = None,
        timeout: int = 30,
    ) -> bytes:
        """
        Render in the long-lived page registered under ``page_id``.

        The page is obtained from ``WebRenderer.get_persistent_page`` and is
        left open afterwards.

        :param page_id: unique identifier of the persistent page
        :param url: URL to open
        :param target: CSS selector of the element to capture
        :param params: key/value pairs appended to the URL as query parameters
        :param other_function: optional async callback that receives the page
        :param timeout: page-load timeout in seconds
        :return: screenshot bytes
        """
        page = await WebRenderer.get_persistent_page(page_id, url)
        return await self.inner_render(
            page, url, target, 0, params or {}, other_function, timeout
        )

    @staticmethod
    def _build_url(url: str, params: dict[str, Any] | None) -> str:
        """Return ``url`` with ``params`` appended as a percent-encoded query string."""
        if not params:
            return url
        # quote_via=quote encodes spaces as %20 rather than "+", which is
        # decoded correctly by both URLSearchParams and decodeURIComponent.
        query = urlencode(params, quote_via=quote)
        if "?" not in url:
            separator = "?"
        elif url.endswith(("?", "&")):
            separator = ""
        else:
            # The URL already carries a query; extend it, do not start a second one.
            separator = "&"
        return f"{url}{separator}{query}"

    async def inner_render(
        self,
        page: Page,
        url: str,
        target: str,
        index: int = 0,
        params: dict[str, Any] | None = None,
        other_function: PageFunction | None = None,
        timeout: int = 30,
    ) -> bytes:
        """
        Navigate ``page`` and capture the element matching ``target``.

        Steps: load the URL, wait for the selector, run ``other_function``
        if given, then screenshot the ``index``-th match. If nothing matches
        after the callback ran, ``body`` is captured instead; an out-of-range
        ``index`` falls back to the first match.

        :param page: page to navigate; it is not closed here
        :param url: URL to open
        :param target: CSS selector of the element to capture
        :param index: which match to capture when the selector matches several elements
        :param params: key/value pairs appended to the URL as query parameters
        :param other_function: optional async callback that receives the page
        :param timeout: timeout in seconds, applied to navigation and to the selector wait
        :return: screenshot bytes
        """
        logger.debug(f"Navigating to {url} with timeout {timeout}")
        timeout_ms = timeout * 1000  # Playwright expects milliseconds
        await page.goto(
            self._build_url(url, params), timeout=timeout_ms, wait_until="load"
        )
        logger.debug("Page loaded successfully")
        # Wait for the target element to appear.
        await page.wait_for_selector(target, timeout=timeout_ms)
        logger.debug(f"Target element '{target}' found, taking screenshot")
        if other_function:
            await other_function(page)
        # Query again: the callback may have changed or removed the matches.
        elements = await page.query_selector_all(target)
        if not elements:
            logger.warning(f"Target element '{target}' not found on the page.")
            elements = await page.query_selector_all('body')
        if not -len(elements) <= index < len(elements):
            logger.warning(f"Index {index} out of range for elements matching '{target}'")
            index = 0
        screenshot = await elements[index].screenshot()
        logger.debug("Screenshot taken successfully")
        return screenshot


class LocalPlaywrightInstance(PlaywrightInstance):
    """Playwright backend that launches a headless Chromium on this machine."""

    def __init__(self) -> None:
        self._playwright: Playwright | None = None
        self._browser: Browser | None = None
        super().__init__()

    @property
    def playwright(self) -> Playwright:
        """The started Playwright driver; only available after ``init()``."""
        assert self._playwright is not None, "Playwright must be initialized by calling init()."
        return self._playwright

    @property
    def browser(self) -> Browser:
        """The launched browser; only available after ``init()``."""
        assert self._browser is not None, "Browser must be launched by calling init()."
        return self._browser

    async def init(self) -> None:
        """Start the Playwright driver and launch headless Chromium."""
        self._playwright = await async_playwright().start()
        try:
            self._browser = await self._playwright.chromium.launch(headless=True)
        except BaseException:
            # Do not leave the driver running when the browser cannot be
            # launched (e.g. the browser binaries are not installed).
            await self._playwright.stop()
            self._playwright = None
            raise

    @classmethod
    async def create(cls) -> "LocalPlaywrightInstance":
        """Factory: build an instance and initialize it."""
        instance = cls()
        await instance.init()
        return instance

    async def close(self) -> None:
        """Close the browser and stop the Playwright driver.

        Safe to call before ``init()`` or more than once.
        """
        # Detach the references first so a repeated call becomes a no-op.
        browser, self._browser = self._browser, None
        playwright, self._playwright = self._playwright, None
        try:
            if browser is not None:
                await browser.close()
        finally:
            # Stop the driver even if closing the browser failed.
            if playwright is not None:
                await playwright.stop()


class RemotePlaywrightInstance(PlaywrightInstance):
    """Playwright backend that connects to a remote browser over WebSocket."""

    def __init__(self, ws_endpoint: str) -> None:
        self._playwright: Playwright | None = None
        self._browser: Browser | None = None
        self._ws_endpoint = ws_endpoint
        super().__init__()

    @property
    def playwright(self) -> Playwright:
        """The started Playwright driver; only available after ``init()``."""
        assert self._playwright is not None, "Playwright must be initialized by calling init()."
        return self._playwright

    @property
    def browser(self) -> Browser:
        """The connected remote browser; only available after ``init()``."""
        assert self._browser is not None, "Browser must be connected by calling init()."
        return self._browser

    async def init(self) -> None:
        """
        Start the local Playwright driver and connect to the remote server.

        :raises ConnectionError: if the connection attempt fails
        """
        logger.info(f"尝试连接远程 Playwright 服务器: {self._ws_endpoint}")
        self._playwright = await async_playwright().start()
        try:
            self._browser = await self._playwright.chromium.connect(
                self._ws_endpoint
            )
            logger.info("成功连接到远程 Playwright 服务器。")
        except PlaywrightError as e:
            await self._playwright.stop()
            # Forget the stopped driver so a later close() does not stop it again.
            self._playwright = None
            raise ConnectionError(
                f"无法连接到远程 Playwright 服务器 ({self._ws_endpoint}):{e}"
            ) from e

    @classmethod
    async def create(cls, ws_endpoint: str) -> "RemotePlaywrightInstance":
        """
        Factory: build a remote instance and connect it.

        :param ws_endpoint: WebSocket endpoint of the remote Playwright server
        """
        instance = cls(ws_endpoint)
        await instance.init()
        return instance

    async def close(self) -> None:
        """
        Disconnect from the remote browser and stop the local Playwright driver.

        Safe to call before ``init()`` or more than once.
        """
        # Detach the references first so a repeated call becomes a no-op.
        browser, self._browser = self._browser, None
        playwright, self._playwright = self._playwright, None
        try:
            if browser is not None:
                await browser.close()
        finally:
            # Stop the driver even if closing the browser failed.
            if playwright is not None:
                await playwright.stop()
        # Reported through the logger, like every other message in this file.
        logger.info("已断开远程连接,本地 Playwright 实例已停止。")