commit ff9c54d5e4
Date: 2025-04-24 13:11:28 +08:00

    first commit

5960 changed files with 834111 additions and 0 deletions

@@ -0,0 +1,32 @@
# Usage
```
node scripts/learn/checkSanitize/index.mjs https://LEARN_WIKI
```
## Bulk export
There is a bulk export for MediaWiki pages, but it produces different
HTML escaping compared to the regular parse API we use in web.
The bulk export does not escape all the placeholder HTML-like elements,
such as `<project-id` or `<document goes here>`.
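To see why that matters, here is a rough sketch of the effect on sanitizing. It is not part of this commit and uses sanitize-html's default options rather than the `sanitizeOptions` module used in web, so the exact output is only indicative:
```
// Rough sketch only: default sanitize-html options, not the web sanitizeOptions.
import sanitizeHtml from 'sanitize-html'

// Escaped placeholder (parse API style) survives sanitizing as plain text.
console.log(sanitizeHtml('Paste your &lt;document goes here&gt;.'))
// roughly: Paste your &lt;document goes here&gt;.

// Unescaped placeholder (bulk export style) parses as an unknown tag and is dropped.
console.log(sanitizeHtml('Paste your <document goes here>.'))
// roughly: Paste your .
```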
## Example output
Here is how a missing tag gets flagged:
```
---
page : MediaWiki markup for the Overleaf support team
title : MediaWiki markup for the Overleaf support team
match : false
toText : false
text : "Overleaf</strong></td>\n </tr>\n <tr><td>Kb/<strong>TITLE_SLUG</strong></td><td><nowiki>https://www.overleaf.com/learn/how-to/</nowiki><strong>TITLE_SLUG</strong></td>\n </"
sanitized : "Overleaf</strong></td>\n </tr>\n <tr><td>Kb/<strong>TITLE_SLUG</strong></td><td>&lt;nowiki&gt;https://www.overleaf.com/learn/how-to/&lt;/nowiki&gt;<strong>TITLE_SLUG</strong></td>\n "
textToText : " \n \n \n \n MediaWiki page\n Maps to on Overleaf\n \n Kb/TITLE_SLUGhttps://www.overleaf.com/learn/how-to/TITLE_SLUG\n "
sanitizedToText: " \n \n \n \n MediaWiki page\n Maps to on Overleaf\n \n Kb/TITLE_SLUG<nowiki>https://www.overleaf.com/learn/how-to/</nowiki>TITLE"
```
Note the hidden/escaped `<nowiki>` element.
In addition to the side-by-side comparison of the HTML, you will see a plain-text diff.
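## Environment variables
The checker also reads two environment variables, `EXTRACT_STYLES` and `OMIT_STYLE` (see `checkSanitizeOptions.mjs` below). For example, to dump each page's normalized CSS to `data/dumpFolder` while keeping styles in the comparison:
```
EXTRACT_STYLES=true OMIT_STYLE=false node scripts/learn/checkSanitize/index.mjs https://LEARN_WIKI
```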

@@ -0,0 +1,118 @@
import crypto from 'node:crypto'
import fs from 'node:fs'
import Path from 'node:path'
import cheerio from 'cheerio'
// checkSanitizeOptions is only used in dev env
// eslint-disable-next-line import/no-extraneous-dependencies
import prettier from 'prettier'
import sanitizeHtml from 'sanitize-html'
import { sanitizeOptions } from '../../../modules/learn/app/src/sanitizeOptions.js'
import { fileURLToPath } from 'node:url'
const __dirname = Path.dirname(fileURLToPath(import.meta.url))
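// EXTRACT_STYLES=true dumps each normalized <style> block into the dump folder.
// OMIT_STYLE defaults to stripping styles from the comparison; set OMIT_STYLE=false to keep them.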
const EXTRACT_STYLE = process.env.EXTRACT_STYLES === 'true'
const OMIT_STYLE = process.env.OMIT_STYLE !== 'false'
const DUMP_CSS_IN = Path.join(
Path.dirname(Path.dirname(Path.dirname(__dirname))),
'data',
'dumpFolder'
)
function hash(blob) {
return crypto.createHash('sha1').update(blob).digest('hex')
}
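// Normalize parser output so raw and sanitized HTML can be compared:
// extract/drop styles, strip comments, and canonicalize inline styles and markup.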
function normalize(blob, title) {
// styles are dropped in web and kept in wiki pages for previewing there.
blob = blob.replace(/<style>(.+?)<\/style>/gs, (_, match) => {
if (EXTRACT_STYLE) {
// normalize css with prettier
const css = prettier.format(match, { parser: 'css' })
fs.writeFileSync(
Path.join(DUMP_CSS_IN, `${hash(css)}-${encodeURIComponent(title)}.css`),
`/* title: ${title} */\n\n${css}`
)
}
if (OMIT_STYLE) {
return ''
}
return match
})
// strip comments:
// - comment at the bottom of each page
blob = blob.replace(/<!-- \nNewPP limit report.+/s, '')
// - annotation of math characters
blob = blob.replace(/<!-- . -->/g, '')
// wrap for consistent rendering
if (blob.indexOf('<html><head>') !== 0) {
blob = `<html><head>${blob}</head></html>`
}
// normalize inline style:
// - drop trailing ;
blob = blob.replace(/style="([^"]+);"/g, (_, style) => `style="${style}"`)
// - normalize whitespace
blob = blob.replace(
/style="([^"]+)"/g,
(_, style) => `style="${style.trim().replace(/([:;])\s+/g, '$1')}"`
)
// let cheerio do another pass
return cheerio.load(blob).html()
}
function toText(blob) {
return cheerio.load(blob).text()
}
const zoomOut = 50
function peak(content, offset) {
// show some more content before/after the mismatch
if (offset > zoomOut) {
offset -= zoomOut
}
// wrap in JSON to escape new line characters
return JSON.stringify(content.slice(offset, offset + chunkSize + 2 * zoomOut))
}
const chunkSize = 100
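// Walk both strings in chunkSize steps and return the offset of the first chunk that differs.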
function findFirstMismatch(a, b) {
if (a === b) return a.length
let i = 0
while (
a.length > chunkSize &&
b.length > chunkSize &&
a.slice(0, chunkSize) === b.slice(0, chunkSize)
) {
i++
a = a.slice(chunkSize)
b = b.slice(chunkSize)
}
return i * chunkSize
}
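// Compare the parsed wiki HTML against its sanitized version and log details of the first mismatch.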
function checkSanitizeOptions(page, title, text) {
text = normalize(text, title)
const sanitized = normalize(sanitizeHtml(text, sanitizeOptions))
if (text === sanitized) return
const offset = findFirstMismatch(text, sanitized)
const textToText = toText(text)
const sanitizedToText = toText(sanitized)
const offsetText = findFirstMismatch(textToText, sanitizedToText)
console.error('---')
console.error('page :', page)
console.error('title :', title)
console.error('match :', text === sanitized)
console.error('toText :', toText(text) === toText(sanitized))
console.error('text :', peak(text, offset))
console.error('sanitized :', peak(sanitized, offset))
console.error('textToText :', peak(textToText, offsetText))
console.error('sanitizedToText:', peak(sanitizedToText, offsetText))
}
export default checkSanitizeOptions

@@ -0,0 +1,41 @@
import checkSanitizeOptions from './checkSanitizeOptions.mjs'
import Scrape from './scrape.mjs'
import { fileURLToPath } from 'node:url'
const { getAllPagesAndCache, scrapeAndCachePage } = Scrape
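// Walk every page of the wiki and flag content that the sanitize options would alter.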
async function main() {
const BASE_URL = process.argv.pop()
if (!BASE_URL.startsWith('http')) {
throw new Error(
'Usage: node scripts/learn/checkSanitize/index.mjs https://LEARN_WIKI'
)
}
const pages = await getAllPagesAndCache(BASE_URL)
for (const page of pages) {
try {
const parsed = await scrapeAndCachePage(BASE_URL, page)
const title = parsed.title
const text = parsed.text ? parsed.text['*'] : ''
checkSanitizeOptions(page, title, text)
} catch (e) {
console.error('---')
console.error(page, e)
throw e
}
}
}
if (fileURLToPath(import.meta.url) === process.argv[1]) {
try {
await main()
process.exit(0)
} catch (error) {
console.error(error)
process.exit(1)
}
}

@@ -0,0 +1,130 @@
import Path from 'node:path'
import fs from 'node:fs'
import {
fetchString,
fetchJson,
RequestFailedError,
} from '@overleaf/fetch-utils'
import crypto from 'node:crypto'
import { fileURLToPath } from 'node:url'
const __dirname = Path.dirname(fileURLToPath(import.meta.url))
const CACHE_IN = Path.join(
Path.dirname(Path.dirname(Path.dirname(__dirname))),
'data',
'learnPages'
)
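// Fetch the parsed HTML of a single page via the MediaWiki parse API.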
async function scrape(baseUrl, page) {
const uri = new URL(baseUrl + '/learn-scripts/api.php')
uri.search = new URLSearchParams({
page,
action: 'parse',
format: 'json',
redirects: true,
}).toString()
try {
return await fetchString(uri)
} catch (err) {
if (err instanceof RequestFailedError) {
console.error(err.response.status, page, err.response)
} else {
console.error(err)
}
}
}
function hash(blob) {
return crypto.createHash('sha1').update(blob).digest('hex')
}
function getName(page) {
let enc = encodeURIComponent(page)
// There are VERY long titles in MediaWiki.
// Once percent-encoded they exceed the maximum filename length on my Ubuntu box.
if (enc.length > 100) {
enc = enc.slice(0, 100) + hash(page)
}
return enc
}
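// Return the parsed page from the on-disk cache when available, otherwise scrape and cache it.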
async function scrapeAndCachePage(baseUrl, page) {
const path = Path.join(CACHE_IN, getName(page) + '.json')
try {
return JSON.parse(await fs.promises.readFile(path, 'utf-8'))
} catch (e) {
const blob = await scrape(baseUrl, page)
const parsed = JSON.parse(blob).parse
if (!parsed) {
console.error(page, blob)
throw new Error('bad contents')
}
await fs.promises.mkdir(CACHE_IN, { recursive: true })
await fs.promises.writeFile(path, JSON.stringify(parsed, null, 2), 'utf-8')
return parsed
}
}
async function getAllPagesFrom(baseUrl, continueFrom) {
// https://learn.overleaf.com/learn/Special:ApiSandbox#action=query&format=json&generator=allpages&gapfilterredir=nonredirects
const uri = new URL(baseUrl + '/learn-scripts/api.php')
uri.search = new URLSearchParams({
action: 'query',
format: 'json',
generator: 'allpages',
// Ignore pages with redirects. We do not want to check page content twice.
gapfilterredir: 'nonredirects',
// Bump the default page size of 10 up to 100.
gaplimit: 100,
...continueFrom,
}).toString()
let blob
try {
blob = await fetchJson(uri)
} catch (err) {
if (err instanceof RequestFailedError) {
console.error(err.response.status, continueFrom, err.response)
} else {
console.error(err)
throw err
}
}
const nextContinueFrom = blob && blob.continue
const pagesRaw = (blob && blob.query && blob.query.pages) || {}
const pages = Object.values(pagesRaw).map(page => page.title)
return { nextContinueFrom, pages }
}
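// Follow the API continuation tokens until all page titles have been collected.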
async function getAllPages(baseUrl) {
let continueFrom = {}
let allPages = []
while (true) {
const { nextContinueFrom, pages } = await getAllPagesFrom(
baseUrl,
continueFrom
)
allPages = allPages.concat(pages)
if (!nextContinueFrom) break
continueFrom = nextContinueFrom
}
return allPages.sort()
}
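// Cache the full list of page titles on disk so repeated runs skip the listing step.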
async function getAllPagesAndCache(baseUrl) {
const path = Path.join(CACHE_IN, 'allPages.txt')
try {
return JSON.parse(await fs.promises.readFile(path, 'utf-8'))
} catch (e) {
const allPages = await getAllPages(baseUrl)
await fs.promises.mkdir(CACHE_IN, { recursive: true })
await fs.promises.writeFile(path, JSON.stringify(allPages), 'utf-8')
return allPages
}
}
export default {
getAllPagesAndCache,
scrapeAndCachePage,
}