geo/backend/tests/test_content_pipeline/test_html_generator.py

266 lines
7.8 KiB
Python

"""HTML生成器测试"""
import pytest
from app.services.content.html_generator import HTMLGenerator
class TestHTMLGenerator:
    """Tests for HTMLGenerator.

    Covers three public entry points as exercised below:
    ``generate`` (per-platform output, tag filtering, output formats),
    ``to_markdown`` (HTML to Markdown) and ``to_plain`` (HTML to plain text).
    """

    @pytest.fixture
    def generator(self):
        """Provide a fresh HTMLGenerator instance for each test."""
        return HTMLGenerator()

    # ------------------------------------------------------------------
    # generate(): per-platform output
    # ------------------------------------------------------------------

    def test_generate_basic_html(self, generator):
        """Basic generation returns a string."""
        html = generator.generate(
            content="<p>这是测试内容</p>",
            platform="zhihu",
        )
        assert html is not None
        assert isinstance(html, str)

    def test_generate_for_zhihu(self, generator):
        """Zhihu output keeps the heading text."""
        html = generator.generate(
            content="<h1>华为手机评测</h1><p>这是一篇关于华为手机的详细评测文章。</p>",
            platform="zhihu",
        )
        assert html is not None
        assert "华为手机评测" in html

    def test_generate_for_wechat(self, generator):
        """WeChat official-account generation produces a result."""
        html = generator.generate(
            content="<p>华为手机非常好用</p>",
            platform="wechat",
        )
        assert html is not None

    def test_generate_for_xiaohongshu(self, generator):
        """Xiaohongshu generation produces a result."""
        html = generator.generate(
            content="<p>种草笔记内容</p>",
            platform="xiaohongshu",
        )
        assert html is not None

    # ------------------------------------------------------------------
    # generate(): banned-tag filtering
    # ------------------------------------------------------------------

    def test_filter_banned_tags(self, generator):
        """Script tags are removed while regular content is kept."""
        html = generator.generate(
            content="<script>alert('xss')</script><p>正常内容</p>",
            platform="zhihu",
        )
        # The script tag must be stripped from the output.
        assert "<script>" not in html
        assert "正常内容" in html

    def test_filter_banned_tags_wechat_external_links(self, generator):
        """External link targets are filtered out for WeChat."""
        html = generator.generate(
            content="<a href='http://baidu.com'>外部链接</a><p>内容</p>",
            platform="wechat",
        )
        # WeChat output must not carry the external URL.
        assert "http://baidu.com" not in html

    def test_filter_banned_tags_wechat_preserves_internal(self, generator):
        """Internal (mp.weixin.qq.com) links are accepted for WeChat."""
        html = generator.generate(
            content="<a href='https://mp.weixin.qq.com/s/test'>内部链接</a><p>内容</p>",
            platform="wechat",
        )
        # NOTE(review): this only checks that generation succeeds; it does not
        # assert that the internal link survives. Tighten once the expected
        # output shape is confirmed against HTMLGenerator.
        assert html is not None

    # ------------------------------------------------------------------
    # to_markdown()
    # ------------------------------------------------------------------

    def test_to_markdown_h1_conversion(self, generator):
        """An <h1> becomes a level-1 Markdown heading."""
        md = generator.to_markdown("<h1>标题</h1>")
        assert "# 标题" in md

    def test_to_markdown_h2_conversion(self, generator):
        """An <h2> becomes a level-2 Markdown heading."""
        md = generator.to_markdown("<h2>二级标题</h2>")
        assert "## 二级标题" in md

    def test_to_markdown_h3_conversion(self, generator):
        """An <h3> becomes a level-3 Markdown heading."""
        md = generator.to_markdown("<h3>三级标题</h3>")
        assert "### 三级标题" in md

    def test_to_markdown_paragraph_conversion(self, generator):
        """Paragraph text survives the Markdown conversion."""
        md = generator.to_markdown("<p>段落内容</p>")
        assert "段落内容" in md

    def test_to_markdown_br_conversion(self, generator):
        """Text on both sides of a <br> survives the Markdown conversion."""
        md = generator.to_markdown("第一行<br>第二行")
        assert "第一行" in md
        assert "第二行" in md

    def test_to_markdown_list_conversion(self, generator):
        """An <li> becomes a dash-prefixed Markdown list item."""
        md = generator.to_markdown("<li>列表项</li>")
        assert "- 列表项" in md

    def test_to_markdown_code_inline(self, generator):
        """Inline <code> becomes backtick-quoted Markdown."""
        md = generator.to_markdown("<code>代码</code>")
        assert "`代码`" in md

    def test_to_markdown_blockquote(self, generator):
        """A <blockquote> becomes a '>'-prefixed Markdown quote."""
        md = generator.to_markdown("<blockquote>引用内容</blockquote>")
        assert "> 引用内容" in md

    def test_to_markdown_pre_block(self, generator):
        """A <pre> block is emitted as a fenced code block."""
        md = generator.to_markdown("<pre>代码块</pre>")
        assert "```" in md

    def test_to_markdown_strips残留_tags(self, generator):
        """Leftover tags without a Markdown equivalent are stripped."""
        md = generator.to_markdown("<div>内容</div>")
        # The div tag must not leak into the Markdown output.
        assert "<div>" not in md

    # ------------------------------------------------------------------
    # to_plain()
    # ------------------------------------------------------------------

    def test_to_plain_text_basic(self, generator):
        """Heading and paragraph text both appear in the plain-text output."""
        plain = generator.to_plain("<h1>标题</h1><p>段落</p>")
        assert "标题" in plain
        assert "段落" in plain

    def test_to_plain_text_removes_tags(self, generator):
        """Plain-text output contains no HTML tags."""
        plain = generator.to_plain("<script>alert(1)</script><p>内容</p>")
        assert "<script>" not in plain
        assert "<p>" not in plain

    def test_to_plain_text_decodes_html_entities(self, generator):
        """HTML entities are decoded to their literal characters."""
        plain = generator.to_plain("&lt;&gt;&amp;&quot;")
        assert "<" in plain
        assert ">" in plain
        assert "&" in plain
        assert '"' in plain

    def test_to_plain_text_removes_extra_spaces(self, generator):
        """Runs of consecutive spaces are collapsed in plain-text output."""
        # Build the runs explicitly so that editors or copy/paste cannot
        # silently collapse them and turn this into a test of single spaces.
        gap = " " * 4
        plain = generator.to_plain("内容" + gap + "多个" + gap + "空格")
        # No run of two or more spaces may remain.
        assert " " * 2 not in plain

    def test_to_plain_text_removes_extra_newlines(self, generator):
        """Runs of blank lines are collapsed in plain-text output."""
        plain = generator.to_plain("内容\n\n\n换行")
        # There must never be more than two consecutive newlines.
        assert "\n\n\n" not in plain

    # ------------------------------------------------------------------
    # generate(): output formats
    # ------------------------------------------------------------------

    def test_generate_format_html(self, generator):
        """format='html' produces a result."""
        html = generator.generate(
            content="<p>内容</p>",
            platform="zhihu",
            format="html",
        )
        assert html is not None

    def test_generate_format_markdown(self, generator):
        """format='markdown' returns Markdown-converted content."""
        result = generator.generate(
            content="<h1>标题</h1>",
            platform="zhihu",
            format="markdown",
        )
        assert "# 标题" in result

    def test_generate_format_plain(self, generator):
        """format='plain' returns text with the tags removed."""
        result = generator.generate(
            content="<p>内容</p>",
            platform="zhihu",
            format="plain",
        )
        assert "内容" in result
        assert "<p>" not in result

    # ------------------------------------------------------------------
    # Edge cases
    # ------------------------------------------------------------------

    def test_generate_invalid_platform(self, generator):
        """An unknown platform does not prevent generation."""
        html = generator.generate(
            content="<p>内容</p>",
            platform="invalid_platform",
        )
        # NOTE(review): the intent is that an unknown platform returns the
        # original content, but only a non-None result is asserted here.
        # Confirm the exact contract before tightening.
        assert html is not None

    def test_generate_with_empty_content(self, generator):
        """Empty content generates an empty string."""
        html = generator.generate(
            content="",
            platform="zhihu",
        )
        assert html == ""

    def test_to_markdown_empty_content(self, generator):
        """Empty input converts to empty Markdown."""
        md = generator.to_markdown("")
        assert md == ""

    def test_to_plain_empty_content(self, generator):
        """Empty input converts to empty plain text."""
        plain = generator.to_plain("")
        assert plain == ""