Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 26 additions & 15 deletions src/checks/content-discoverability/llms-txt-links-markdown.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import { registerCheck } from '../registry.js';
import { extractMarkdownLinks } from './llms-txt-valid.js';
import { filterByPathPrefix, getPathFilterBase } from '../../helpers/get-page-urls.js';
import { toMdUrls } from '../../helpers/to-md-urls.js';
import { looksLikeMarkdown } from '../../helpers/detect-markdown.js';
import type { CheckContext, CheckResult, DiscoveredFile } from '../../types.js';
Expand Down Expand Up @@ -36,35 +37,45 @@ async function checkLlmsTxtLinksMarkdown(ctx: CheckContext): Promise<CheckResult
};
}

// Collect unique links and partition by origin
const siteOrigin = ctx.effectiveOrigin ?? ctx.origin;
const sameOriginLinks: string[] = [];
const crossOriginLinks: string[] = [];
// Collect unique links, scope to baseUrl path prefix, and partition by origin
const allExtractedUrls = new Set<string>();
for (const file of discovered) {
const links = extractMarkdownLinks(file.content);
for (const link of links) {
if (link.url.startsWith('http://') || link.url.startsWith('https://')) {
try {
const linkOrigin = new URL(link.url).origin;
if (linkOrigin === siteOrigin) {
if (!sameOriginLinks.includes(link.url)) sameOriginLinks.push(link.url);
} else {
if (!crossOriginLinks.includes(link.url)) crossOriginLinks.push(link.url);
}
} catch {
if (!sameOriginLinks.includes(link.url)) sameOriginLinks.push(link.url);
}
allExtractedUrls.add(link.url);
}
}
}
const scopedUrls = filterByPathPrefix(Array.from(allExtractedUrls), getPathFilterBase(ctx));

const siteOrigin = ctx.effectiveOrigin ?? ctx.origin;
const sameOriginLinks: string[] = [];
const crossOriginLinks: string[] = [];
for (const url of scopedUrls) {
try {
const linkOrigin = new URL(url).origin;
if (linkOrigin === siteOrigin) {
sameOriginLinks.push(url);
} else {
crossOriginLinks.push(url);
}
} catch {
sameOriginLinks.push(url);
}
}

const totalLinks = sameOriginLinks.length + crossOriginLinks.length;
if (totalLinks === 0) {
const baseUrlPath = new URL(ctx.baseUrl).pathname.replace(/\/$/, '');
const filteredOut = allExtractedUrls.size > 0 && baseUrlPath && baseUrlPath !== '/';
return {
id: 'llms-txt-links-markdown',
category: 'content-discoverability',
status: 'skip',
message: 'No HTTP(S) links found in llms.txt',
message: filteredOut
? `llms.txt contains ${allExtractedUrls.size} link${allExtractedUrls.size === 1 ? '' : 's'}, but none are under ${baseUrlPath}`
: 'No HTTP(S) links found in llms.txt',
};
}

Expand Down
15 changes: 12 additions & 3 deletions src/checks/content-discoverability/llms-txt-links-resolve.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import { registerCheck } from '../registry.js';
import { LINK_RESOLVE_THRESHOLD } from '../../constants.js';
import { extractMarkdownLinks } from './llms-txt-valid.js';
import { filterByPathPrefix, getPathFilterBase } from '../../helpers/get-page-urls.js';
import type { CheckContext, CheckResult, DiscoveredFile } from '../../types.js';

interface LinkCheckResult {
Expand Down Expand Up @@ -35,20 +36,28 @@ async function checkLlmsTxtLinksResolve(ctx: CheckContext): Promise<CheckResult>
}
}

if (allLinks.size === 0) {
// Scope links to the baseUrl path prefix so that docs at a subpath
// (e.g. /docs) don't include unrelated site content from root llms.txt.
const scopedUrls = filterByPathPrefix(Array.from(allLinks.keys()), getPathFilterBase(ctx));

if (scopedUrls.length === 0) {
const baseUrlPath = new URL(ctx.baseUrl).pathname.replace(/\/$/, '');
const filteredOut = allLinks.size > 0 && baseUrlPath && baseUrlPath !== '/';
return {
id: 'llms-txt-links-resolve',
category: 'content-discoverability',
status: 'skip',
message: 'No HTTP(S) links found in llms.txt',
message: filteredOut
? `llms.txt contains ${allLinks.size} link${allLinks.size === 1 ? '' : 's'}, but none are under ${baseUrlPath}`
: 'No HTTP(S) links found in llms.txt',
};
}

// Partition links into same-origin and cross-origin
const siteOrigin = ctx.effectiveOrigin ?? ctx.origin;
const sameOriginLinks: string[] = [];
const crossOriginLinks: string[] = [];
for (const url of allLinks.keys()) {
for (const url of scopedUrls) {
try {
const linkOrigin = new URL(url).origin;
if (linkOrigin === siteOrigin) {
Expand Down
51 changes: 48 additions & 3 deletions src/helpers/get-page-urls.ts
Original file line number Diff line number Diff line change
Expand Up @@ -309,30 +309,75 @@ export async function getUrlsFromSitemap(
return urls;
}

/**
 * Pick the URL whose pathname should drive path-prefix filtering.
 *
 * If the context carries an `effectiveOrigin` that differs from `origin`
 * (a cross-host redirect, e.g. example.com/docs → docs.example.com), the
 * path of the original `baseUrl` no longer describes the redirected host.
 * In that case the `effectiveOrigin` itself is returned; the doc comment this
 * replaces describes it as a root URL, which turns prefix filtering into a no-op.
 *
 * @param ctx - Check context providing `origin`, `effectiveOrigin` and `baseUrl`.
 * @returns `ctx.effectiveOrigin` when it is set and differs from `ctx.origin`,
 *   otherwise `ctx.baseUrl`.
 */
export function getPathFilterBase(ctx: CheckContext): string {
  const { effectiveOrigin, origin, baseUrl } = ctx;

  // No redirect recorded, or the redirect stayed on the same host:
  // the caller-supplied baseUrl path is still meaningful.
  if (!effectiveOrigin || effectiveOrigin === origin) {
    return baseUrl;
  }

  return effectiveOrigin;
}

/**
 * Filter URLs to those under the baseUrl's path prefix.
 *
 * When `baseUrl` has a non-root path (e.g. `https://plaid.com/docs`), only
 * URLs whose pathname equals that path or sits beneath it (`/docs/...`) are
 * kept. This prevents blog posts, marketing pages, and other non-docs content
 * from polluting the URL pool when llms.txt or sitemaps cover the entire domain.
 *
 * Matching is done on whole path segments, so `/docs-old/x` is NOT considered
 * to be under `/docs`. Only the pathname is compared; the origin of each URL
 * is not inspected here.
 *
 * When `baseUrl` has a root path, `urls` is returned as-is (same array
 * instance, nothing filtered).
 *
 * @param urls - Candidate absolute URLs.
 * @param baseUrl - Absolute URL whose pathname defines the prefix.
 * @returns The URLs under the prefix, in their original order. Entries that
 *   cannot be parsed as URLs are kept rather than silently dropped.
 * @throws TypeError if `baseUrl` itself cannot be parsed by `new URL()`.
 */
export function filterByPathPrefix(urls: string[], baseUrl: string): string[] {
  // Strip *all* trailing slashes so `/docs`, `/docs/` and `/docs//` normalize
  // to `/docs`, and `/` or `//` collapse to the empty string (root).
  const prefix = new URL(baseUrl).pathname.replace(/\/+$/, '');
  if (prefix === '') return urls;

  // Hoisted out of the filter callback; the trailing slash enforces a
  // segment boundary so `/docs` does not match `/docs-old`.
  const prefixWithSlash = `${prefix}/`;

  return urls.filter((url) => {
    let pathname: string;
    try {
      pathname = new URL(url).pathname;
    } catch {
      return true; // keep malformed URLs rather than silently dropping them
    }
    return pathname === prefix || pathname.startsWith(prefixWithSlash);
  });
}

/**
* Discover page URLs from llms.txt links, sitemap, or fall back to baseUrl.
*
* Priority:
* 1. llms.txt links (from previous check results)
* 2. Sitemap URLs (robots.txt Sitemap directives, then /sitemap.xml fallback)
* 3. baseUrl fallback
*
* All discovered URLs are filtered to the baseUrl's path prefix so that
* docs at a subpath (e.g. `/docs`) don't include unrelated site content.
*/
export async function getPageUrls(ctx: CheckContext): Promise<PageUrlResult> {
const warnings: string[] = [];

const filterBase = getPathFilterBase(ctx);

// 1. Try llms.txt links from cached results (if llms-txt-exists ran)
const cachedUrls = await getUrlsFromCachedLlmsTxt(ctx);
if (cachedUrls.length > 0) return { urls: cachedUrls, warnings };
const scopedCachedUrls = filterByPathPrefix(cachedUrls, filterBase);
if (scopedCachedUrls.length > 0) return { urls: scopedCachedUrls, warnings };

// 2. Try fetching llms.txt directly (standalone mode, llms-txt-exists didn't run)
if (!ctx.previousResults.has('llms-txt-exists')) {
const fetchedUrls = await fetchLlmsTxtUrls(ctx);
if (fetchedUrls.length > 0) return { urls: fetchedUrls, warnings };
const scopedFetchedUrls = filterByPathPrefix(fetchedUrls, filterBase);
if (scopedFetchedUrls.length > 0) return { urls: scopedFetchedUrls, warnings };
}

// 3. Try sitemap
const sitemapUrls = await getUrlsFromSitemap(ctx, warnings);
if (sitemapUrls.length > 0) return { urls: sitemapUrls, warnings };
const scopedSitemapUrls = filterByPathPrefix(sitemapUrls, filterBase);
if (scopedSitemapUrls.length > 0) return { urls: scopedSitemapUrls, warnings };

// 4. Fallback
return { urls: [ctx.baseUrl], warnings };
Expand Down
44 changes: 44 additions & 0 deletions test/unit/checks/llms-txt-links-markdown.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -263,6 +263,50 @@ Just text, no links here.
expect(result.status).toBe('fail');
});

// ── Path-prefix scoping ──

it('scopes links to baseUrl path prefix', async () => {
  // The site-wide llms.txt mixes one /docs link with /blog and /careers links;
  // with a /docs base URL only the docs link may be counted.
  const ctx = createContext('http://scope-md.local/docs', { requestDelay: 0 });
  const llmsTxtFile: DiscoveredFile = {
    url: 'http://scope-md.local/llms.txt',
    content: `# Site\n- [Intro](http://scope-md.local/docs/intro.md): Intro\n- [Blog](http://scope-md.local/blog/post.md): Blog\n- [Careers](http://scope-md.local/careers): Careers\n`,
    status: 200,
    redirected: false,
  };

  // Seed the cached result that the check reads its discovered files from.
  ctx.previousResults.set('llms-txt-exists', {
    id: 'llms-txt-exists',
    category: 'content-discoverability',
    status: 'pass',
    message: 'Found',
    details: { discoveredFiles: [llmsTxtFile] },
  });

  const { status, details } = await check.run(ctx);

  expect(status).toBe('pass');
  // Exactly one link (/docs/intro.md) was in scope, and it is markdown.
  expect(details?.testedLinks).toBe(1);
  expect(details?.markdownRate).toBe(100);
});

it('skips with descriptive message when all links are outside the baseUrl path prefix', async () => {
  // llms.txt holds a single link, and it lives under /blog rather than /docs.
  const ctx = createContext('http://scope-md2.local/docs', { requestDelay: 0 });
  const llmsTxtFile: DiscoveredFile = {
    url: 'http://scope-md2.local/llms.txt',
    content: `# Site\n- [Blog](http://scope-md2.local/blog/post): Blog\n`,
    status: 200,
    redirected: false,
  };

  // Seed the cached result that the check reads its discovered files from.
  ctx.previousResults.set('llms-txt-exists', {
    id: 'llms-txt-exists',
    category: 'content-discoverability',
    status: 'pass',
    message: 'Found',
    details: { discoveredFiles: [llmsTxtFile] },
  });

  const { status, message } = await check.run(ctx);

  // The skip message must say how many links existed and which prefix excluded them.
  expect(status).toBe('skip');
  expect(message).toContain('1 link');
  expect(message).toContain('none are under /docs');
});

it('uses toMdUrls to find .md variants (handles trailing slash and .html)', async () => {
server.use(
http.head(
Expand Down
47 changes: 47 additions & 0 deletions test/unit/checks/llms-txt-links-resolve.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,53 @@ Just text, no links.
expect(result.message).toContain('rate-limited (HTTP 429)');
});

// ── Path-prefix scoping ──

it('scopes links to baseUrl path prefix', async () => {
  // Only the in-scope docs page gets a HEAD handler; the /blog link has none.
  server.use(
    http.head('http://scope-res.local/docs/page1', () => new HttpResponse(null, { status: 200 })),
  );

  // The site-wide llms.txt lists one /docs link and one /blog link;
  // with a /docs base URL only the docs link may be tested.
  const ctx = createContext('http://scope-res.local/docs', { requestDelay: 0 });
  const llmsTxtFile: DiscoveredFile = {
    url: 'http://scope-res.local/llms.txt',
    content: `# Site\n- [Page](http://scope-res.local/docs/page1): Page\n- [Blog](http://scope-res.local/blog/post): Blog\n`,
    status: 200,
    redirected: false,
  };

  // Seed the cached result that the check reads its discovered files from.
  ctx.previousResults.set('llms-txt-exists', {
    id: 'llms-txt-exists',
    category: 'content-discoverability',
    status: 'pass',
    message: 'Found',
    details: { discoveredFiles: [llmsTxtFile] },
  });

  const { status, details } = await check.run(ctx);

  expect(status).toBe('pass');
  // Exactly one same-origin link (/docs/page1) was tested and it resolved.
  expect(details?.sameOrigin).toMatchObject({ tested: 1, resolved: 1 });
});

it('skips with descriptive message when all links are outside the baseUrl path prefix', async () => {
  // llms.txt holds a single link, and it lives under /blog rather than /docs.
  const ctx = createContext('http://scope-res2.local/docs', { requestDelay: 0 });
  const llmsTxtFile: DiscoveredFile = {
    url: 'http://scope-res2.local/llms.txt',
    content: `# Site\n- [Blog](http://scope-res2.local/blog/post): Blog\n`,
    status: 200,
    redirected: false,
  };

  // Seed the cached result that the check reads its discovered files from.
  ctx.previousResults.set('llms-txt-exists', {
    id: 'llms-txt-exists',
    category: 'content-discoverability',
    status: 'pass',
    message: 'Found',
    details: { discoveredFiles: [llmsTxtFile] },
  });

  const { status, message } = await check.run(ctx);

  // The skip message must say how many links existed and which prefix excluded them.
  expect(status).toBe('skip');
  expect(message).toContain('1 link');
  expect(message).toContain('none are under /docs');
});

it('includes "sampled" in message when results are sampled', async () => {
const links = Array.from(
{ length: 5 },
Expand Down
Loading