agent-ecosystem · dacharyc · Apr 11, 2026 · Apr 11, 2026
diff --git a/src/checks/content-discoverability/llms-txt-links-markdown.ts b/src/checks/content-discoverability/llms-txt-links-markdown.ts
@@ -1,5 +1,6 @@
 import { registerCheck } from '../registry.js';
 import { extractMarkdownLinks } from './llms-txt-valid.js';
+import { filterByPathPrefix, getPathFilterBase } from '../../helpers/get-page-urls.js';
 import { toMdUrls } from '../../helpers/to-md-urls.js';
 import { looksLikeMarkdown } from '../../helpers/detect-markdown.js';
 import type { CheckContext, CheckResult, DiscoveredFile } from '../../types.js';
@@ -36,35 +37,45 @@ async function checkLlmsTxtLinksMarkdown(ctx: CheckContext): Promise<CheckResult
     };
   }
 
-  // Collect unique links and partition by origin
-  const siteOrigin = ctx.effectiveOrigin ?? ctx.origin;
-  const sameOriginLinks: string[] = [];
-  const crossOriginLinks: string[] = [];
+  // Collect unique links, scope to baseUrl path prefix, and partition by origin
+  const allExtractedUrls = new Set<string>();
   for (const file of discovered) {
     const links = extractMarkdownLinks(file.content);
     for (const link of links) {
       if (link.url.startsWith('http://') || link.url.startsWith('https://')) {
-        try {
-          const linkOrigin = new URL(link.url).origin;
-          if (linkOrigin === siteOrigin) {
-            if (!sameOriginLinks.includes(link.url)) sameOriginLinks.push(link.url);
-          } else {
-            if (!crossOriginLinks.includes(link.url)) crossOriginLinks.push(link.url);
-          }
-        } catch {
-          if (!sameOriginLinks.includes(link.url)) sameOriginLinks.push(link.url);
-        }
+        allExtractedUrls.add(link.url);
+      }
+    }
+  }
+  const scopedUrls = filterByPathPrefix(Array.from(allExtractedUrls), getPathFilterBase(ctx));
+
+  const siteOrigin = ctx.effectiveOrigin ?? ctx.origin;
+  const sameOriginLinks: string[] = [];
+  const crossOriginLinks: string[] = [];
+  for (const url of scopedUrls) {
+    try {
+      const linkOrigin = new URL(url).origin;
+      if (linkOrigin === siteOrigin) {
+        sameOriginLinks.push(url);
+      } else {
+        crossOriginLinks.push(url);
       }
+    } catch {
+      sameOriginLinks.push(url);
     }
   }
 
   const totalLinks = sameOriginLinks.length + crossOriginLinks.length;
   if (totalLinks === 0) {
+    const baseUrlPath = new URL(ctx.baseUrl).pathname.replace(/\/$/, '');
+    const filteredOut = allExtractedUrls.size > 0 && baseUrlPath && baseUrlPath !== '/';
     return {
       id: 'llms-txt-links-markdown',
       category: 'content-discoverability',
       status: 'skip',
-      message: 'No HTTP(S) links found in llms.txt',
+      message: filteredOut
+        ? `llms.txt contains ${allExtractedUrls.size} link${allExtractedUrls.size === 1 ? '' : 's'}, but none are under ${baseUrlPath}`
+        : 'No HTTP(S) links found in llms.txt',
     };
   }
 

diff --git a/src/checks/content-discoverability/llms-txt-links-resolve.ts b/src/checks/content-discoverability/llms-txt-links-resolve.ts
@@ -1,6 +1,7 @@
 import { registerCheck } from '../registry.js';
 import { LINK_RESOLVE_THRESHOLD } from '../../constants.js';
 import { extractMarkdownLinks } from './llms-txt-valid.js';
+import { filterByPathPrefix, getPathFilterBase } from '../../helpers/get-page-urls.js';
 import type { CheckContext, CheckResult, DiscoveredFile } from '../../types.js';
 
 interface LinkCheckResult {
@@ -35,20 +36,28 @@ async function checkLlmsTxtLinksResolve(ctx: CheckContext): Promise<CheckResult>
     }
   }
 
-  if (allLinks.size === 0) {
+  // Scope links to the baseUrl path prefix so that docs at a subpath
+  // (e.g. /docs) don't include unrelated site content from root llms.txt.
+  const scopedUrls = filterByPathPrefix(Array.from(allLinks.keys()), getPathFilterBase(ctx));
+
+  if (scopedUrls.length === 0) {
+    const baseUrlPath = new URL(ctx.baseUrl).pathname.replace(/\/$/, '');
+    const filteredOut = allLinks.size > 0 && baseUrlPath && baseUrlPath !== '/';
     return {
       id: 'llms-txt-links-resolve',
       category: 'content-discoverability',
       status: 'skip',
-      message: 'No HTTP(S) links found in llms.txt',
+      message: filteredOut
+        ? `llms.txt contains ${allLinks.size} link${allLinks.size === 1 ? '' : 's'}, but none are under ${baseUrlPath}`
+        : 'No HTTP(S) links found in llms.txt',
     };
   }
 
   // Partition links into same-origin and cross-origin
   const siteOrigin = ctx.effectiveOrigin ?? ctx.origin;
   const sameOriginLinks: string[] = [];
   const crossOriginLinks: string[] = [];
-  for (const url of allLinks.keys()) {
+  for (const url of scopedUrls) {
     try {
       const linkOrigin = new URL(url).origin;
       if (linkOrigin === siteOrigin) {

diff --git a/src/helpers/get-page-urls.ts b/src/helpers/get-page-urls.ts
@@ -309,30 +309,75 @@ export async function getUrlsFromSitemap(
   return urls;
 }
 
+/**
+ * Choose the URL whose path is used as the prefix when scoping discovered URLs.
+ *
+ * If the context has an effectiveOrigin that differs from its origin (a cross-host
+ * redirect, e.g. example.com/docs → docs.example.com), that effectiveOrigin is
+ * returned instead of baseUrl: the baseUrl path does not apply on the new host.
+ */
+export function getPathFilterBase(ctx: CheckContext): string {
+  const redirectedOrigin = ctx.effectiveOrigin;
+  if (redirectedOrigin && redirectedOrigin !== ctx.origin) return redirectedOrigin;
+  return ctx.baseUrl;
+}
+
+/**
+ * Keep only URLs whose pathname is under the path of `baseUrl` (e.g. `/docs`).
+ * A URL matches when its pathname equals the prefix or continues past it at a
+ * `/` boundary (`/docs/api` yes, `/docs-old` no). Hosts are not compared. A root
+ * base returns `urls` unchanged; `new URL(baseUrl)` throws if it is unparseable.
+ *
+ * @param urls - Absolute URLs to scope; unparseable entries are kept.
+ * @param baseUrl - URL supplying the prefix; trailing slashes are ignored.
+ * @returns The matching URLs in their original order.
+ */
+export function filterByPathPrefix(urls: string[], baseUrl: string): string[] {
+  // Strip every trailing slash so `/docs`, `/docs/` and `/docs//` scope alike.
+  const prefix = new URL(baseUrl).pathname.replace(/\/+$/, '');
+  if (prefix === '') return urls;
+  const childPrefix = `${prefix}/`; // invariant across the filter callback
+  return urls.filter((url) => {
+    try {
+      const { pathname } = new URL(url);
+      return pathname === prefix || pathname.startsWith(childPrefix);
+    } catch {
+      return true; // keep malformed URLs rather than silently dropping them
+    }
+  });
+}
+
 /**
  * Discover page URLs from llms.txt links, sitemap, or fall back to baseUrl.
  *
  * Priority:
  * 1. llms.txt links (from previous check results)
  * 2. Sitemap URLs (robots.txt Sitemap directives, then /sitemap.xml fallback)
  * 3. baseUrl fallback
+ *
+ * All discovered URLs are filtered to the baseUrl's path prefix so that
+ * docs at a subpath (e.g. `/docs`) don't include unrelated site content.
  */
 export async function getPageUrls(ctx: CheckContext): Promise<PageUrlResult> {
   const warnings: string[] = [];
 
+  const filterBase = getPathFilterBase(ctx);
+
   // 1. Try llms.txt links from cached results (if llms-txt-exists ran)
   const cachedUrls = await getUrlsFromCachedLlmsTxt(ctx);
-  if (cachedUrls.length > 0) return { urls: cachedUrls, warnings };
+  const scopedCachedUrls = filterByPathPrefix(cachedUrls, filterBase);
+  if (scopedCachedUrls.length > 0) return { urls: scopedCachedUrls, warnings };
 
   // 2. Try fetching llms.txt directly (standalone mode, llms-txt-exists didn't run)
   if (!ctx.previousResults.has('llms-txt-exists')) {
     const fetchedUrls = await fetchLlmsTxtUrls(ctx);
-    if (fetchedUrls.length > 0) return { urls: fetchedUrls, warnings };
+    const scopedFetchedUrls = filterByPathPrefix(fetchedUrls, filterBase);
+    if (scopedFetchedUrls.length > 0) return { urls: scopedFetchedUrls, warnings };
   }
 
   // 3. Try sitemap
   const sitemapUrls = await getUrlsFromSitemap(ctx, warnings);
-  if (sitemapUrls.length > 0) return { urls: sitemapUrls, warnings };
+  const scopedSitemapUrls = filterByPathPrefix(sitemapUrls, filterBase);
+  if (scopedSitemapUrls.length > 0) return { urls: scopedSitemapUrls, warnings };
 
   // 4. Fallback
   return { urls: [ctx.baseUrl], warnings };

diff --git a/test/unit/checks/llms-txt-links-markdown.test.ts b/test/unit/checks/llms-txt-links-markdown.test.ts
@@ -263,6 +263,50 @@ Just text, no links here.
     expect(result.status).toBe('fail');
   });
 
+  // ── Path-prefix scoping ──
+
+  it('scopes links to baseUrl path prefix', async () => {
+    // Fixture: llms.txt lists one link under /docs and two outside it (/blog, /careers).
+    const content = `# Site\n- [Intro](http://scope-md.local/docs/intro.md): Intro\n- [Blog](http://scope-md.local/blog/post.md): Blog\n- [Careers](http://scope-md.local/careers): Careers\n`;
+    const ctx = createContext('http://scope-md.local/docs', { requestDelay: 0 });
+    const discovered: DiscoveredFile[] = [
+      { url: 'http://scope-md.local/llms.txt', content, status: 200, redirected: false },
+    ];
+    ctx.previousResults.set('llms-txt-exists', {
+      id: 'llms-txt-exists',
+      category: 'content-discoverability',
+      status: 'pass',
+      message: 'Found',
+      details: { discoveredFiles: discovered },
+    });
+    // The context's baseUrl is /docs, so only the intro.md link is expected to be in scope.
+    const result = await check.run(ctx);
+    expect(result.status).toBe('pass');
+    // Exactly one link tested, and it counts as markdown (markdownRate 100%).
+    expect(result.details?.testedLinks).toBe(1);
+    expect(result.details?.markdownRate).toBe(100);
+  });
+
+  it('skips with descriptive message when all links are outside the baseUrl path prefix', async () => {
+    const content = `# Site\n- [Blog](http://scope-md2.local/blog/post): Blog\n`;
+    const ctx = createContext('http://scope-md2.local/docs', { requestDelay: 0 });
+    const discovered: DiscoveredFile[] = [
+      { url: 'http://scope-md2.local/llms.txt', content, status: 200, redirected: false },
+    ];
+    ctx.previousResults.set('llms-txt-exists', {
+      id: 'llms-txt-exists',
+      category: 'content-discoverability',
+      status: 'pass',
+      message: 'Found',
+      details: { discoveredFiles: discovered },
+    });
+    // The lone /blog link is outside /docs: expect a skip whose message gives the count and the prefix.
+    const result = await check.run(ctx);
+    expect(result.status).toBe('skip');
+    expect(result.message).toContain('1 link');
+    expect(result.message).toContain('none are under /docs');
+  });
+
   it('uses toMdUrls to find .md variants (handles trailing slash and .html)', async () => {
     server.use(
       http.head(

diff --git a/test/unit/checks/llms-txt-links-resolve.test.ts b/test/unit/checks/llms-txt-links-resolve.test.ts
@@ -194,6 +194,53 @@ Just text, no links.
     expect(result.message).toContain('rate-limited (HTTP 429)');
   });
 
+  // ── Path-prefix scoping ──
+
+  it('scopes links to baseUrl path prefix', async () => {
+    server.use(
+      http.head('http://scope-res.local/docs/page1', () => new HttpResponse(null, { status: 200 })),
+    );
+
+    // Fixture: one link under /docs (the only URL given a HEAD handler above) and one under /blog.
+    const content = `# Site\n- [Page](http://scope-res.local/docs/page1): Page\n- [Blog](http://scope-res.local/blog/post): Blog\n`;
+    const ctx = createContext('http://scope-res.local/docs', { requestDelay: 0 });
+    const discovered: DiscoveredFile[] = [
+      { url: 'http://scope-res.local/llms.txt', content, status: 200, redirected: false },
+    ];
+    ctx.previousResults.set('llms-txt-exists', {
+      id: 'llms-txt-exists',
+      category: 'content-discoverability',
+      status: 'pass',
+      message: 'Found',
+      details: { discoveredFiles: discovered },
+    });
+
+    const result = await check.run(ctx);
+    expect(result.status).toBe('pass');
+    // The same-origin bucket should hold exactly the one in-scope link, and it resolved.
+    expect(result.details?.sameOrigin).toMatchObject({ tested: 1, resolved: 1 });
+  });
+
+  it('skips with descriptive message when all links are outside the baseUrl path prefix', async () => {
+    const content = `# Site\n- [Blog](http://scope-res2.local/blog/post): Blog\n`;
+    const ctx = createContext('http://scope-res2.local/docs', { requestDelay: 0 });
+    const discovered: DiscoveredFile[] = [
+      { url: 'http://scope-res2.local/llms.txt', content, status: 200, redirected: false },
+    ];
+    ctx.previousResults.set('llms-txt-exists', {
+      id: 'llms-txt-exists',
+      category: 'content-discoverability',
+      status: 'pass',
+      message: 'Found',
+      details: { discoveredFiles: discovered },
+    });
+    // The lone /blog link is outside /docs: expect a skip whose message gives the count and the prefix.
+    const result = await check.run(ctx);
+    expect(result.status).toBe('skip');
+    expect(result.message).toContain('1 link');
+    expect(result.message).toContain('none are under /docs');
+  });
+
   it('includes "sampled" in message when results are sampled', async () => {
     const links = Array.from(
       { length: 5 },