AI schema generators sometimes output relative URLs, broken targets, or URLs from staging domains. These break rich-result eligibility and confuse AI engines that follow URLs to fetch related entities. The fix: validate every URL field, ensure each URL is absolute and resolvable, and align it with your canonical URLs.
Every schema type has URL-typed properties. Common ones:
"@id" → unique identifier (URL form) "url" → primary URL of the entity "mainEntityOfPage" → page where entity is primarily described "image" → image URL or ImageObject "logo" → organisation logo URL "sameAs" → array of external profile URLs "author.url" → author's profile page "publisher.url" → publisher homepage "offers.url" → buy/checkout URL "contentUrl" → direct file URL (for MediaObject) "embedUrl" → iframe-embeddable URL (for VideoObject) "telephone" → not URL but treated similarly
{
"@type": "Product",
"name": "Widget",
"image": "/images/widget.jpg", ❌ relative
"url": "/products/widget", ❌ relative
"offers": {
"url": "/products/widget#buy" ❌ relative
}
}
{
"url": "https://staging.example.com/products/widget" ❌ staging
}
{
"image": "http://localhost:3000/images/widget.jpg" ❌ localhost
}
{
"image": "//cdn.example.com/widget.jpg" ❌ protocol-relative
}
<!-- Works in browsers; fails in schema parsers -->
{
"url": "https://example.com/products/widget?utm_source=schema" ❌ tracked
}
<!-- Schema URL should be canonical, no UTM -->
{
"url": "https://example.com/old-product-url" ❌ 301 redirect
}
{
"image": "https://example.com/deleted-image.jpg" ❌ 404
}
{
"@context": "https://schema.org",
"@type": "Product",
"@id": "https://example.com/products/widget#product",
"name": "Blue Widget",
"url": "https://example.com/products/widget",
"image": [
"https://example.com/images/widget-1200x630.jpg",
"https://example.com/images/widget-1080x1080.jpg"
],
"mainEntityOfPage": "https://example.com/products/widget",
"offers": {
"@type": "Offer",
"url": "https://example.com/products/widget#buy",
"price": "29.00",
"priceCurrency": "GBP"
},
"brand": {
"@type": "Brand",
"name": "Acme",
"url": "https://example.com/brand/acme"
}
}
All URLs absolute, HTTPS, canonical (no UTM), pointing at the live domain.
If your AI schema generator outputs the wrong URLs, the fix usually belongs in the generator's configuration, not in manual edits to its output:
// Hand the generator its URL context up front, so that it can resolve
// relative URLs against baseUrl and check that the URLs it produces
// match the production domain.
const generatorOptions = {
  content: pageContent,
  // Always set: the origin relative URLs are resolved from
  baseUrl: 'https://example.com',
  // The page being annotated
  currentUrl: req.canonicalUrl,
  // Strip staging URLs
  environment: 'production'
};
const schema = await generateSchema(generatorOptions);
// Run this on every generator output before deploy

// Property names whose string values are URLs. Any key ending in "Url"
// is also treated as a URL field (see looksLikeUrlField).
const URL_FIELDS = [
  'url',
  'image',
  'logo',
  'sameAs',
  'contentUrl',
  'embedUrl',
  '@id',
  'mainEntityOfPage',
];

// URL fields that point at third-party sites on purpose; they must still be
// absolute HTTPS URLs but are exempt from the production-domain check.
const EXTERNAL_URL_FIELDS = ['sameAs'];

/**
 * Decide whether a property name holds URL values.
 *
 * @param {string} key - Property name from the schema object.
 * @returns {boolean} True if the key is listed in URL_FIELDS or ends in "Url".
 */
function looksLikeUrlField(key) {
  return URL_FIELDS.includes(key) || key.endsWith('Url');
}

/**
 * Walk a JSON-LD schema and collect problems with its URL-typed fields.
 *
 * Each URL value (a string directly under a URL field, or a string element
 * of an array under a URL field) is checked, in order:
 *   1. it must be absolute (start with http:// or https://); a relative or
 *      protocol-relative value yields a single "relative URL" error and no
 *      further checks, since it has no scheme or host to inspect;
 *   2. it must use HTTPS;
 *   3. its hostname must be `expectedDomain` or a subdomain of it
 *      (skipped for EXTERNAL_URL_FIELDS such as sameAs).
 *
 * The hostname is compared, not the raw string, so a URL that merely
 * mentions the domain in its path or query does not pass. Because
 * subdomains are accepted, pass the full production host
 * (e.g. 'www.example.com') if sibling hosts such as a staging subdomain
 * must be rejected.
 *
 * @param {object|Array} schema - Parsed JSON-LD (object or array of objects).
 * @param {string} expectedDomain - Production host, with or without a scheme.
 *   An empty string disables the domain check.
 * @returns {string[]} Human-readable errors, each prefixed with the dotted
 *   path of the offending value (array indices appear as path segments).
 */
function validateSchemaUrls(schema, expectedDomain) {
  const errors = [];

  // Returns the lower-cased hostname of an absolute URL, or null if it
  // cannot be parsed.
  function hostnameOf(text) {
    try {
      return new URL(text).hostname.toLowerCase();
    } catch {
      return null;
    }
  }

  // Accept 'example.com', 'example.com/', or 'https://example.com' alike.
  const rawDomain = String(expectedDomain).trim();
  const expectedHost =
    rawDomain === ''
      ? ''
      : hostnameOf(rawDomain.includes('://') ? rawDomain : `https://${rawDomain}`) ??
        rawDomain.toLowerCase();

  // True when the URL's host is the expected host or one of its subdomains.
  function isOnExpectedHost(value) {
    if (expectedHost === '') return true;
    const host = hostnameOf(value);
    return host !== null && (host === expectedHost || host.endsWith(`.${expectedHost}`));
  }

  // Validates one URL string found at `path` under the URL field `fieldKey`.
  function checkUrl(value, path, fieldKey) {
    const isPlainHttp = value.startsWith('http://');

    // Must be absolute
    if (!isPlainHttp && !value.startsWith('https://')) {
      errors.push(`${path}: relative URL "${value}"`);
      return;
    }
    // Must be HTTPS
    if (isPlainHttp) {
      errors.push(`${path}: HTTP not HTTPS "${value}"`);
    }
    // Must be production domain, unless the field is external by design
    if (!EXTERNAL_URL_FIELDS.includes(fieldKey) && !isOnExpectedHost(value)) {
      errors.push(`${path}: wrong domain "${value}"`);
    }
  }

  // `arrayFieldKey` is set when `node` is an array sitting under a URL field,
  // so its string elements are judged by that field name rather than by
  // their numeric index.
  function walk(node, path = '', arrayFieldKey = undefined) {
    if (typeof node !== 'object' || node === null) return;
    const inUrlArray = Array.isArray(node) && arrayFieldKey !== undefined;

    for (const [key, value] of Object.entries(node)) {
      const p = path ? `${path}.${key}` : key;
      const fieldKey = inUrlArray ? arrayFieldKey : key;
      const isUrlField = looksLikeUrlField(fieldKey);

      if (typeof value === 'string') {
        if (isUrlField) checkUrl(value, p, fieldKey);
      } else if (Array.isArray(value)) {
        walk(value, p, isUrlField ? fieldKey : undefined);
      } else {
        // Nested entity (e.g. an ImageObject): its own keys decide.
        walk(value, p);
      }
    }
  }

  walk(schema);
  return errors;
}
# From a deployed page: print every string value starting with "http" found
# in the page's JSON-LD <script> blocks, one "path: url" pair per line.
curl -s https://example.com/products/widget | \
python3 -c "
import json, re, sys

html = sys.stdin.read()

# Walk a parsed JSON-LD value and print every URL value with its path
def walk(o, path=''):
    if isinstance(o, dict):
        for k, v in o.items():
            walk(v, f'{path}.{k}')
    elif isinstance(o, list):
        for i, v in enumerate(o):
            walk(v, f'{path}[{i}]')
    elif isinstance(o, str) and o.startswith('http'):
        print(f'{path}: {o}')

# One JSON document per ld+json script tag
for m in re.finditer(r'<script type=\"application/ld\+json\">(.+?)</script>', html, re.DOTALL):
    data = json.loads(m.group(1))
    walk(data)
"
# Check every URL listed in schema-urls.txt (one per line) and report each
# one that does not answer with HTTP 200. curl is not told to follow
# redirects, so a 301/302 is reported as BAD too.
# `read -r` keeps backslashes intact; the `|| [ -n "$url" ]` clause still
# processes a final line that has no trailing newline.
while IFS= read -r url || [ -n "$url" ]; do
  # Skip blank lines instead of calling curl with an empty URL
  [ -n "$url" ] || continue
  code=$(curl -s -o /dev/null -w "%{http_code}" "$url")
  # Use `if` rather than `[ ... ] && echo` so a good URL on the last line
  # does not leave the loop with a non-zero exit status
  if [ "$code" != "200" ]; then
    echo "BAD $code $url"
  fi
done < schema-urls.txt
Make URL validation part of your build. Failing schema URLs should block deploy:
# .github/workflows/schema-check.yml
# Step that fails the job on the first schema URL that does not return 200.
- name: Validate schema URLs
  run: |
    npm run build
    node scripts/extract-schema-urls.js > urls.txt
    # read -r keeps backslashes intact; the || clause handles a last line
    # without a trailing newline
    while IFS= read -r url || [ -n "$url" ]; do
      # Skip blank lines instead of calling curl with an empty URL
      [ -n "$url" ] || continue
      # -I sends a HEAD request; only the status code is captured
      code=$(curl -sIo /dev/null -w "%{http_code}" "$url")
      if [ "$code" != "200" ]; then
        echo "::error::Schema URL broken: $code $url"
        exit 1
      fi
    done < urls.txt