mirror of
https://github.com/openclaw/openclaw.git
synced 2026-04-05 00:43:54 +02:00
Replace proportional text estimate with binary search for the largest text prefix whose rendered Telegram HTML fits the character limit, then split at the last whitespace boundary within that verified prefix. Single words longer than the limit still hard-split (unavoidable). Markdown formatting stays balanced across split points. Fixes #36644
496 lines
19 KiB
TypeScript
496 lines
19 KiB
TypeScript
import { describe, expect, it } from "vitest";
|
|
import {
|
|
markdownToTelegramChunks,
|
|
markdownToTelegramHtml,
|
|
renderTelegramHtmlText,
|
|
wrapFileReferencesInHtml,
|
|
} from "./format.js";
|
|
|
|
describe("wrapFileReferencesInHtml", () => {
|
|
it("wraps supported file references and paths", () => {
|
|
const cases = [
|
|
["Check README.md", "Check <code>README.md</code>"],
|
|
["See HEARTBEAT.md for status", "See <code>HEARTBEAT.md</code> for status"],
|
|
["Check main.go", "Check <code>main.go</code>"],
|
|
["Run script.py", "Run <code>script.py</code>"],
|
|
["Check backup.pl", "Check <code>backup.pl</code>"],
|
|
["Run backup.sh", "Run <code>backup.sh</code>"],
|
|
["Look at squad/friday/HEARTBEAT.md", "Look at <code>squad/friday/HEARTBEAT.md</code>"],
|
|
] as const;
|
|
for (const [input, expected] of cases) {
|
|
expect(wrapFileReferencesInHtml(input), input).toContain(expected);
|
|
}
|
|
});
|
|
|
|
it("does not wrap inside protected html contexts", () => {
|
|
const cases = [
|
|
"Already <code>wrapped.md</code> here",
|
|
"<pre><code>README.md</code></pre>",
|
|
'<a href="README.md">Link</a>',
|
|
'Visit <a href="https://example.com/README.md">example.com/README.md</a>',
|
|
] as const;
|
|
for (const input of cases) {
|
|
const result = wrapFileReferencesInHtml(input);
|
|
expect(result, input).toBe(input);
|
|
}
|
|
expect(wrapFileReferencesInHtml(cases[0])).not.toContain("<code><code>");
|
|
});
|
|
|
|
it("handles mixed content correctly", () => {
|
|
const result = wrapFileReferencesInHtml("Check README.md and CONTRIBUTING.md");
|
|
expect(result).toContain("<code>README.md</code>");
|
|
expect(result).toContain("<code>CONTRIBUTING.md</code>");
|
|
});
|
|
|
|
it("handles boundary and punctuation wrapping cases", () => {
|
|
const cases = [
|
|
{ input: "No markdown files here", contains: undefined },
|
|
{ input: "File.md at start", contains: "<code>File.md</code>" },
|
|
{ input: "Ends with file.md", contains: "<code>file.md</code>" },
|
|
{ input: "See README.md.", contains: "<code>README.md</code>." },
|
|
{ input: "See README.md,", contains: "<code>README.md</code>," },
|
|
{ input: "(README.md)", contains: "(<code>README.md</code>)" },
|
|
{ input: "README.md:", contains: "<code>README.md</code>:" },
|
|
] as const;
|
|
|
|
for (const testCase of cases) {
|
|
const result = wrapFileReferencesInHtml(testCase.input);
|
|
if (!testCase.contains) {
|
|
expect(result).not.toContain("<code>");
|
|
continue;
|
|
}
|
|
expect(result).toContain(testCase.contains);
|
|
}
|
|
});
|
|
|
|
it("de-linkifies auto-linkified anchors for plain files and paths", () => {
|
|
const cases = [
|
|
{
|
|
input: '<a href="http://README.md">README.md</a>',
|
|
expected: "<code>README.md</code>",
|
|
},
|
|
{
|
|
input: '<a href="http://squad/friday/HEARTBEAT.md">squad/friday/HEARTBEAT.md</a>',
|
|
expected: "<code>squad/friday/HEARTBEAT.md</code>",
|
|
},
|
|
] as const;
|
|
for (const testCase of cases) {
|
|
expect(wrapFileReferencesInHtml(testCase.input)).toBe(testCase.expected);
|
|
}
|
|
});
|
|
|
|
it("preserves explicit links where label differs from href", () => {
|
|
const cases = [
|
|
'<a href="http://README.md">click here</a>',
|
|
'<a href="http://other.md">README.md</a>',
|
|
] as const;
|
|
for (const input of cases) {
|
|
expect(wrapFileReferencesInHtml(input)).toBe(input);
|
|
}
|
|
});
|
|
|
|
it("wraps file ref after closing anchor tag", () => {
|
|
const input = '<a href="https://example.com">link</a> then README.md';
|
|
const result = wrapFileReferencesInHtml(input);
|
|
expect(result).toContain("</a> then <code>README.md</code>");
|
|
});
|
|
});
|
|
|
|
describe("renderTelegramHtmlText - file reference wrapping", () => {
|
|
it("wraps file references in markdown mode", () => {
|
|
const result = renderTelegramHtmlText("Check README.md");
|
|
expect(result).toContain("<code>README.md</code>");
|
|
});
|
|
|
|
it("does not wrap in HTML mode (trusts caller markup)", () => {
|
|
// textMode: "html" should pass through unchanged - caller owns the markup
|
|
const result = renderTelegramHtmlText("Check README.md", { textMode: "html" });
|
|
expect(result).toBe("Check README.md");
|
|
expect(result).not.toContain("<code>");
|
|
});
|
|
|
|
it("does not double-wrap already code-formatted content", () => {
|
|
const result = renderTelegramHtmlText("Already `wrapped.md` here");
|
|
// Should have code tags but not nested
|
|
expect(result).toContain("<code>");
|
|
expect(result).not.toContain("<code><code>");
|
|
});
|
|
});
|
|
|
|
describe("markdownToTelegramHtml - file reference wrapping", () => {
|
|
it("wraps file references by default", () => {
|
|
const result = markdownToTelegramHtml("Check README.md");
|
|
expect(result).toContain("<code>README.md</code>");
|
|
});
|
|
|
|
it("can skip wrapping when requested", () => {
|
|
const result = markdownToTelegramHtml("Check README.md", { wrapFileRefs: false });
|
|
expect(result).not.toContain("<code>README.md</code>");
|
|
});
|
|
|
|
it("wraps multiple file types in a single message", () => {
|
|
const result = markdownToTelegramHtml("Edit main.go and script.py");
|
|
expect(result).toContain("<code>main.go</code>");
|
|
expect(result).toContain("<code>script.py</code>");
|
|
});
|
|
|
|
it("preserves real URLs as anchor tags", () => {
|
|
const result = markdownToTelegramHtml("Visit https://example.com");
|
|
expect(result).toContain('<a href="https://example.com">');
|
|
});
|
|
|
|
it("preserves explicit markdown links even when href looks like a file ref", () => {
|
|
const result = markdownToTelegramHtml("[docs](http://README.md)");
|
|
expect(result).toContain('<a href="http://README.md">docs</a>');
|
|
});
|
|
|
|
it("wraps file ref after real URL in same message", () => {
|
|
const result = markdownToTelegramHtml("Visit https://example.com and README.md");
|
|
expect(result).toContain('<a href="https://example.com">');
|
|
expect(result).toContain("<code>README.md</code>");
|
|
});
|
|
});
|
|
|
|
describe("markdownToTelegramChunks - file reference wrapping", () => {
|
|
it("wraps file references in chunked output", () => {
|
|
const chunks = markdownToTelegramChunks("Check README.md and backup.sh", 4096);
|
|
expect(chunks.length).toBeGreaterThan(0);
|
|
expect(chunks[0].html).toContain("<code>README.md</code>");
|
|
expect(chunks[0].html).toContain("<code>backup.sh</code>");
|
|
});
|
|
|
|
it("keeps rendered html chunks within the provided limit", () => {
|
|
const input = "<".repeat(1500);
|
|
const chunks = markdownToTelegramChunks(input, 512);
|
|
expect(chunks.length).toBeGreaterThan(1);
|
|
expect(chunks.map((chunk) => chunk.text).join("")).toBe(input);
|
|
expect(chunks.every((chunk) => chunk.html.length <= 512)).toBe(true);
|
|
});
|
|
|
|
it("preserves whitespace when html-limit retry splitting runs", () => {
|
|
const input = "a < b";
|
|
const chunks = markdownToTelegramChunks(input, 5);
|
|
expect(chunks.length).toBeGreaterThan(1);
|
|
expect(chunks.map((chunk) => chunk.text).join("")).toBe(input);
|
|
expect(chunks.every((chunk) => chunk.html.length <= 5)).toBe(true);
|
|
});
|
|
|
|
it("prefers word boundaries when escaped html shrinks the retry window", () => {
|
|
const input = "alpha <<";
|
|
const chunks = markdownToTelegramChunks(input, 8);
|
|
expect(chunks.map((chunk) => chunk.text).join("")).toBe(input);
|
|
expect(chunks[0]?.text).toBe("alpha ");
|
|
expect(chunks.every((chunk) => chunk.html.length <= 8)).toBe(true);
|
|
});
|
|
|
|
it("prefers word boundaries when html-limit retry splits formatted prose", () => {
|
|
const input = "**Which of these**";
|
|
const chunks = markdownToTelegramChunks(input, 16);
|
|
expect(chunks.map((chunk) => chunk.text)).toEqual(["Which of ", "these"]);
|
|
expect(chunks.every((chunk) => chunk.html.length <= 16)).toBe(true);
|
|
});
|
|
|
|
it("preserves formatting while splitting at word boundaries", () => {
|
|
const input = "**alpha <<**";
|
|
const chunks = markdownToTelegramChunks(input, 13);
|
|
expect(chunks.map((chunk) => chunk.text).join("")).toBe("alpha <<");
|
|
expect(chunks[0]?.text).toBe("alpha ");
|
|
expect(chunks.every((chunk) => chunk.html.length <= 13)).toBe(true);
|
|
expect(chunks.every((chunk) => chunk.html.startsWith("<b>") && chunk.html.endsWith("</b>"))).toBe(
|
|
true,
|
|
);
|
|
});
|
|
|
|
it("does not rely on monotonic html length for sliced file refs", () => {
|
|
const input = "README.md<";
|
|
const chunks = markdownToTelegramChunks(input, 22);
|
|
expect(chunks.map((chunk) => chunk.text).join("")).toBe(input);
|
|
expect(chunks[0]?.text).toBe("README.md");
|
|
expect(chunks[0]?.html).toBe("<code>README.md</code>");
|
|
expect(chunks.every((chunk) => chunk.html.length <= 22)).toBe(true);
|
|
});
|
|
|
|
it("gracefully returns the original chunk when tag overhead exceeds the limit", () => {
|
|
const input = "**ab**";
|
|
expect(() => markdownToTelegramChunks(input, 6)).not.toThrow();
|
|
const chunks = markdownToTelegramChunks(input, 6);
|
|
expect(chunks).toHaveLength(1);
|
|
expect(chunks[0]?.text).toBe("ab");
|
|
expect(chunks[0]?.html).toBe("<b>ab</b>");
|
|
});
|
|
|
|
it("falls back to in-paren word boundaries when the parenthesis is unbalanced", () => {
|
|
const input = "**foo (bar baz qux quux**";
|
|
const chunks = markdownToTelegramChunks(input, 20);
|
|
expect(chunks.map((chunk) => chunk.text)).toEqual(["foo", "(bar baz qux ", "quux"]);
|
|
expect(chunks.every((chunk) => chunk.html.length <= 20)).toBe(true);
|
|
});
|
|
|
|
it("falls back to hard splits when a single word exceeds the limit", () => {
|
|
const input = "supercalifragilistic";
|
|
const chunks = markdownToTelegramChunks(input, 8);
|
|
expect(chunks.map((chunk) => chunk.text)).toEqual(["supercal", "ifragili", "stic"]);
|
|
expect(chunks.every((chunk) => chunk.html.length <= 8)).toBe(true);
|
|
});
|
|
|
|
it("does not emit whitespace-only chunks during html-limit retry splitting", () => {
|
|
const input = "**ab <<**";
|
|
const chunks = markdownToTelegramChunks(input, 11);
|
|
expect(chunks.map((chunk) => chunk.text).join("")).toBe("ab <<");
|
|
expect(chunks.every((chunk) => chunk.text.trim().length > 0)).toBe(true);
|
|
expect(chunks.every((chunk) => chunk.html.length <= 11)).toBe(true);
|
|
});
|
|
|
|
it("preserves paragraph separators when retry chunking produces whitespace-only spans", () => {
|
|
const input = "ab\n\n<<";
|
|
const chunks = markdownToTelegramChunks(input, 6);
|
|
expect(chunks.map((chunk) => chunk.text).join("")).toBe(input);
|
|
expect(chunks.every((chunk) => chunk.html.length <= 6)).toBe(true);
|
|
});
|
|
});
|
|
|
|
describe("edge cases", () => {
|
|
it("wraps file refs inside emphasis tags", () => {
|
|
const cases = [
|
|
["**README.md**", "<b><code>README.md</code></b>"],
|
|
["*script.py*", "<i><code>script.py</code></i>"],
|
|
] as const;
|
|
for (const [input, expected] of cases) {
|
|
expect(markdownToTelegramHtml(input), input).toBe(expected);
|
|
}
|
|
});
|
|
|
|
it("does not wrap inside fenced code blocks", () => {
|
|
const result = markdownToTelegramHtml("```\nREADME.md\n```");
|
|
expect(result).toBe("<pre><code>README.md\n</code></pre>");
|
|
expect(result).not.toContain("<code><code>");
|
|
});
|
|
|
|
it("preserves real URL/domain paths as anchors", () => {
|
|
const cases = [
|
|
{
|
|
input: "example.com/README.md",
|
|
href: 'href="http://example.com/README.md"',
|
|
},
|
|
{
|
|
input: "https://github.com/foo/README.md",
|
|
href: 'href="https://github.com/foo/README.md"',
|
|
},
|
|
] as const;
|
|
for (const testCase of cases) {
|
|
const result = markdownToTelegramHtml(testCase.input);
|
|
expect(result).toContain(`<a ${testCase.href}>`);
|
|
expect(result).not.toContain("<code>");
|
|
}
|
|
});
|
|
|
|
it("handles wrapFileRefs: false (plain text output)", () => {
|
|
const result = markdownToTelegramHtml("README.md", { wrapFileRefs: false });
|
|
// buildTelegramLink returns null, so no <a> tag; wrapFileRefs: false skips <code>
|
|
expect(result).toBe("README.md");
|
|
});
|
|
|
|
it("classifies extension-like tokens as file refs or domains", () => {
|
|
const cases = [
|
|
{
|
|
name: "supported file-style extensions",
|
|
input: "Makefile.am and code.at and app.be and main.cc",
|
|
contains: [
|
|
"<code>Makefile.am</code>",
|
|
"<code>code.at</code>",
|
|
"<code>app.be</code>",
|
|
"<code>main.cc</code>",
|
|
],
|
|
},
|
|
{
|
|
name: "popular domain TLDs stay links",
|
|
input: "Check x.ai and vercel.io and app.tv and radio.fm",
|
|
contains: [
|
|
'<a href="http://x.ai">',
|
|
'<a href="http://vercel.io">',
|
|
'<a href="http://app.tv">',
|
|
'<a href="http://radio.fm">',
|
|
],
|
|
},
|
|
{
|
|
name: ".co stays links",
|
|
input: "Visit t.co and openclaw.co",
|
|
contains: ['<a href="http://t.co">', '<a href="http://openclaw.co">'],
|
|
notContains: ["<code>t.co</code>", "<code>openclaw.co</code>"],
|
|
},
|
|
{
|
|
name: "non-target extensions stay plain text",
|
|
input: "image.png and style.css and script.js",
|
|
notContains: ["<code>image.png</code>", "<code>style.css</code>", "<code>script.js</code>"],
|
|
},
|
|
] as const;
|
|
for (const testCase of cases) {
|
|
const result = markdownToTelegramHtml(testCase.input);
|
|
if ("contains" in testCase && testCase.contains) {
|
|
for (const expected of testCase.contains) {
|
|
expect(result, testCase.name).toContain(expected);
|
|
}
|
|
}
|
|
if ("notContains" in testCase && testCase.notContains) {
|
|
for (const unexpected of testCase.notContains) {
|
|
expect(result, testCase.name).not.toContain(unexpected);
|
|
}
|
|
}
|
|
}
|
|
});
|
|
|
|
it("wraps file refs across boundaries, sequences, and path variants", () => {
|
|
const cases = [
|
|
{
|
|
name: "message start boundary",
|
|
input: "README.md is important",
|
|
expectedExact: "<code>README.md</code> is important",
|
|
},
|
|
{
|
|
name: "message end boundary",
|
|
input: "Check the README.md",
|
|
expectedExact: "Check the <code>README.md</code>",
|
|
},
|
|
{
|
|
name: "multiple file refs",
|
|
input: "README.md CHANGELOG.md LICENSE.md",
|
|
contains: [
|
|
"<code>README.md</code>",
|
|
"<code>CHANGELOG.md</code>",
|
|
"<code>LICENSE.md</code>",
|
|
],
|
|
},
|
|
{
|
|
name: "nested path",
|
|
input: "src/utils/helpers/format.go",
|
|
contains: ["<code>src/utils/helpers/format.go</code>"],
|
|
},
|
|
{
|
|
name: "version-like non-domain path",
|
|
input: "v1.0/README.md",
|
|
contains: ["<code>v1.0/README.md</code>"],
|
|
},
|
|
{
|
|
name: "domain with version path",
|
|
input: "example.com/v1.0/README.md",
|
|
contains: ['<a href="http://example.com/v1.0/README.md">'],
|
|
},
|
|
{
|
|
name: "hyphen underscore and uppercase extensions",
|
|
input: "my-file_name.md README.MD and SCRIPT.PY",
|
|
contains: [
|
|
"<code>my-file_name.md</code>",
|
|
"<code>README.MD</code>",
|
|
"<code>SCRIPT.PY</code>",
|
|
],
|
|
},
|
|
] as const;
|
|
for (const testCase of cases) {
|
|
const result = markdownToTelegramHtml(testCase.input);
|
|
if ("expectedExact" in testCase) {
|
|
expect(result, testCase.name).toBe(testCase.expectedExact);
|
|
}
|
|
if ("contains" in testCase && testCase.contains) {
|
|
for (const expected of testCase.contains) {
|
|
expect(result, testCase.name).toContain(expected);
|
|
}
|
|
}
|
|
}
|
|
});
|
|
|
|
it("handles nested code tags (depth tracking)", () => {
|
|
// Nested <code> inside <pre> - should not wrap inner content
|
|
const input = "<pre><code>README.md</code></pre> then script.py";
|
|
const result = wrapFileReferencesInHtml(input);
|
|
expect(result).toBe("<pre><code>README.md</code></pre> then <code>script.py</code>");
|
|
});
|
|
|
|
it("handles multiple anchor tags in sequence", () => {
|
|
const input =
|
|
'<a href="https://a.com">link1</a> README.md <a href="https://b.com">link2</a> script.py';
|
|
const result = wrapFileReferencesInHtml(input);
|
|
expect(result).toContain("</a> <code>README.md</code> <a");
|
|
expect(result).toContain("</a> <code>script.py</code>");
|
|
});
|
|
|
|
it("wraps orphaned TLD pattern after special character", () => {
|
|
// R&D.md - the & breaks the main pattern, but D.md could be auto-linked
|
|
// So we wrap the orphaned D.md part to prevent Telegram linking it
|
|
const input = "R&D.md";
|
|
const result = wrapFileReferencesInHtml(input);
|
|
expect(result).toBe("R&<code>D.md</code>");
|
|
});
|
|
|
|
it("wraps orphaned single-letter TLD patterns", () => {
|
|
// Use extensions still in the set (md, sh, py, go)
|
|
const result1 = wrapFileReferencesInHtml("X.md is cool");
|
|
expect(result1).toContain("<code>X.md</code>");
|
|
|
|
const result2 = wrapFileReferencesInHtml("Check R.sh");
|
|
expect(result2).toContain("<code>R.sh</code>");
|
|
});
|
|
|
|
it("does not match filenames containing angle brackets", () => {
|
|
// The regex character class [a-zA-Z0-9_.\\-./] doesn't include < >
|
|
// so these won't be matched and wrapped (which is correct/safe)
|
|
const input = "file<script>.md";
|
|
const result = wrapFileReferencesInHtml(input);
|
|
// Not wrapped because < breaks the filename pattern
|
|
expect(result).toBe(input);
|
|
});
|
|
|
|
it("wraps file ref before unrelated HTML tags", () => {
|
|
// x.md followed by unrelated closing tag and bold - wrap the file ref only
|
|
const input = "x.md <b>bold</b>";
|
|
const result = wrapFileReferencesInHtml(input);
|
|
expect(result).toBe("<code>x.md</code> <b>bold</b>");
|
|
});
|
|
|
|
it("handles malformed HTML with stray closing tags (negative depth)", () => {
|
|
// Stray </code> before content shouldn't break protection logic
|
|
// (depth should clamp at 0, not go negative)
|
|
const input = "</code>README.md<code>inside</code> after.md";
|
|
const result = wrapFileReferencesInHtml(input);
|
|
// README.md should be wrapped (codeDepth = 0 after clamping stray close)
|
|
expect(result).toContain("<code>README.md</code>");
|
|
// after.md should be wrapped (codeDepth = 0 after proper close)
|
|
expect(result).toContain("<code>after.md</code>");
|
|
// Should not have nested code tags
|
|
expect(result).not.toContain("<code><code>");
|
|
});
|
|
|
|
it("does not wrap orphaned TLD fragments inside protected HTML contexts", () => {
|
|
const cases = [
|
|
"<code>R&D.md</code>",
|
|
'<a href="https://example.com">R&D.md</a>',
|
|
'<a href="http://example.com/R&D.md">link</a>',
|
|
'<img src="logo/R&D.md" alt="R&D.md">',
|
|
] as const;
|
|
for (const input of cases) {
|
|
const result = wrapFileReferencesInHtml(input);
|
|
expect(result, input).toBe(input);
|
|
expect(result, input).not.toContain("<code>D.md</code>");
|
|
expect(result, input).not.toContain("<code><code>");
|
|
expect(result, input).not.toContain("</code></code>");
|
|
}
|
|
});
|
|
|
|
it("handles multiple orphaned TLDs with HTML tags (offset stability)", () => {
|
|
// This tests the bug where offset is relative to pre-replacement string
|
|
// but we were checking against the mutating result string
|
|
const input = '<a href="http://A.md">link</a> B.md <span title="C.sh">text</span> D.py';
|
|
const result = wrapFileReferencesInHtml(input);
|
|
// A.md in href should NOT be wrapped (inside attribute)
|
|
// B.md outside tags SHOULD be wrapped
|
|
// C.sh in title attribute should NOT be wrapped
|
|
// D.py outside tags SHOULD be wrapped
|
|
expect(result).toContain("<code>B.md</code>");
|
|
expect(result).toContain("<code>D.py</code>");
|
|
expect(result).not.toContain("<code>A.md</code>");
|
|
expect(result).not.toContain("<code>C.sh</code>");
|
|
// Attributes should be unchanged
|
|
expect(result).toContain('href="http://A.md"');
|
|
expect(result).toContain('title="C.sh"');
|
|
});
|
|
});
|