commit ff9c54d5e4
Date: 2025-04-24 13:11:28 +08:00

    first commit

5960 changed files with 834111 additions and 0 deletions

@@ -0,0 +1,32 @@
# Usage
```
node scripts/learn/checkSanitize/index.mjs https://LEARN_WIKI
```
## Bulk export
There is a bulk export for MediaWiki pages, but it produces different
HTML escaping compared to the regular parse API we use in web.
The bulk export does not escape all the placeholder HTML-like elements,
such as `<project-id` or `<document goes here>`.
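To see why that matters, here is a rough sketch of the effect on sanitizing. It is not part of this commit and uses sanitize-html's default options rather than the `sanitizeOptions` module used in web, so the exact output is only indicative:
```
// Rough sketch only: default sanitize-html options, not the web sanitizeOptions.
import sanitizeHtml from 'sanitize-html'

// Escaped placeholder (parse API style) survives sanitizing as plain text.
console.log(sanitizeHtml('Paste your &lt;document goes here&gt;.'))
// roughly: Paste your &lt;document goes here&gt;.

// Unescaped placeholder (bulk export style) parses as an unknown tag and is dropped.
console.log(sanitizeHtml('Paste your <document goes here>.'))
// roughly: Paste your .
```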
## Example output
Here is how a missing tag gets flagged:
```
---
page : MediaWiki markup for the Overleaf support team
title : MediaWiki markup for the Overleaf support team
match : false
toText : false
text : "Overleaf</strong></td>\n </tr>\n <tr><td>Kb/<strong>TITLE_SLUG</strong></td><td><nowiki>https://www.overleaf.com/learn/how-to/</nowiki><strong>TITLE_SLUG</strong></td>\n </"
sanitized : "Overleaf</strong></td>\n </tr>\n <tr><td>Kb/<strong>TITLE_SLUG</strong></td><td>&lt;nowiki&gt;https://www.overleaf.com/learn/how-to/&lt;/nowiki&gt;<strong>TITLE_SLUG</strong></td>\n "
textToText : " \n \n \n \n MediaWiki page\n Maps to on Overleaf\n \n Kb/TITLE_SLUGhttps://www.overleaf.com/learn/how-to/TITLE_SLUG\n "
sanitizedToText: " \n \n \n \n MediaWiki page\n Maps to on Overleaf\n \n Kb/TITLE_SLUG<nowiki>https://www.overleaf.com/learn/how-to/</nowiki>TITLE"
```
Note the hidden/escaped `<nowiki>` element.
In addition to the side-by-side comparison of the HTML, you will see a plain-text diff.
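## Environment variables
The checker also reads two environment variables, `EXTRACT_STYLES` and `OMIT_STYLE` (see `checkSanitizeOptions.mjs` below). For example, to dump each page's normalized CSS to `data/dumpFolder` while keeping styles in the comparison:
```
EXTRACT_STYLES=true OMIT_STYLE=false node scripts/learn/checkSanitize/index.mjs https://LEARN_WIKI
```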

@@ -0,0 +1,118 @@
import crypto from 'node:crypto'
import fs from 'node:fs'
import Path from 'node:path'
import cheerio from 'cheerio'
// checkSanitizeOptions is only used in dev env
// eslint-disable-next-line import/no-extraneous-dependencies
import prettier from 'prettier'
import sanitizeHtml from 'sanitize-html'
import { sanitizeOptions } from '../../../modules/learn/app/src/sanitizeOptions.js'
import { fileURLToPath } from 'node:url'
const __dirname = Path.dirname(fileURLToPath(import.meta.url))
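// EXTRACT_STYLES=true dumps each normalized <style> block into the dump folder.
// OMIT_STYLE defaults to stripping styles from the comparison; set OMIT_STYLE=false to keep them.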
const EXTRACT_STYLE = process.env.EXTRACT_STYLES === 'true'
const OMIT_STYLE = process.env.OMIT_STYLE !== 'false'
const DUMP_CSS_IN = Path.join(
Path.dirname(Path.dirname(Path.dirname(__dirname))),
'data',
'dumpFolder'
)
function hash(blob) {
return crypto.createHash('sha1').update(blob).digest('hex')
}
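// Normalize parser output so raw and sanitized HTML can be compared:
// extract/drop styles, strip comments, and canonicalize inline styles and markup.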
function normalize(blob, title) {
// styles are dropped in web and kept in wiki pages for previewing there.
blob = blob.replace(/<style>(.+?)<\/style>/gs, (_, match) => {
if (EXTRACT_STYLE) {
// normalize css with prettier
const css = prettier.format(match, { parser: 'css' })
fs.writeFileSync(
Path.join(DUMP_CSS_IN, `${hash(css)}-${encodeURIComponent(title)}.css`),
`/* title: ${title} */\n\n${css}`
)
}
if (OMIT_STYLE) {
return ''
}
return match
})
// strip comments:
// - comment at the bottom of each page
blob = blob.replace(/<!-- \nNewPP limit report.+/s, '')
// - annotation of math characters
blob = blob.replace(/<!-- . -->/g, '')
// wrap for consistent rendering
if (blob.indexOf('<html><head>') !== 0) {
blob = `<html><head>${blob}</head></html>`
}
// normalize inline style:
// - drop trailing ;
blob = blob.replace(/style="([^"]+);"/g, (_, style) => `style="${style}"`)
// - normalize whitespace
blob = blob.replace(
/style="([^"]+)"/g,
(_, style) => `style="${style.trim().replace(/([:;])\s+/g, '$1')}"`
)
// let cheerio do another pass
return cheerio.load(blob).html()
}
function toText(blob) {
return cheerio.load(blob).text()
}
const zoomOut = 50
function peak(content, offset) {
// show some more content before/after the mismatch
if (offset > zoomOut) {
offset -= zoomOut
}
// wrap in JSON to escape new line characters
return JSON.stringify(content.slice(offset, offset + chunkSize + 2 * zoomOut))
}
const chunkSize = 100
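// Walk both strings in chunkSize steps and return the offset of the first chunk that differs.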
function findFirstMismatch(a, b) {
if (a === b) return a.length
let i = 0
while (
a.length > chunkSize &&
b.length > chunkSize &&
a.slice(0, chunkSize) === b.slice(0, chunkSize)
) {
i++
a = a.slice(chunkSize)
b = b.slice(chunkSize)
}
return i * chunkSize
}
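// Compare the parsed wiki HTML against its sanitized version and log details of the first mismatch.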
function checkSanitizeOptions(page, title, text) {
text = normalize(text, title)
const sanitized = normalize(sanitizeHtml(text, sanitizeOptions))
if (text === sanitized) return
const offset = findFirstMismatch(text, sanitized)
const textToText = toText(text)
const sanitizedToText = toText(sanitized)
const offsetText = findFirstMismatch(textToText, sanitizedToText)
console.error('---')
console.error('page :', page)
console.error('title :', title)
console.error('match :', text === sanitized)
console.error('toText :', toText(text) === toText(sanitized))
console.error('text :', peak(text, offset))
console.error('sanitized :', peak(sanitized, offset))
console.error('textToText :', peak(textToText, offsetText))
console.error('sanitizedToText:', peak(sanitizedToText, offsetText))
}
export default checkSanitizeOptions

@@ -0,0 +1,41 @@
import checkSanitizeOptions from './checkSanitizeOptions.mjs'
import Scrape from './scrape.mjs'
import { fileURLToPath } from 'node:url'
const { getAllPagesAndCache, scrapeAndCachePage } = Scrape
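// Walk every page of the wiki and flag content that the sanitize options would alter.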
async function main() {
const BASE_URL = process.argv.pop()
if (!BASE_URL.startsWith('http')) {
throw new Error(
'Usage: node scripts/learn/checkSanitize/index.mjs https://LEARN_WIKI'
)
}
const pages = await getAllPagesAndCache(BASE_URL)
for (const page of pages) {
try {
const parsed = await scrapeAndCachePage(BASE_URL, page)
const title = parsed.title
const text = parsed.text ? parsed.text['*'] : ''
checkSanitizeOptions(page, title, text)
} catch (e) {
console.error('---')
console.error(page, e)
throw e
}
}
}
if (fileURLToPath(import.meta.url) === process.argv[1]) {
try {
await main()
process.exit(0)
} catch (error) {
console.error(error)
process.exit(1)
}
}

@@ -0,0 +1,130 @@
import Path from 'node:path'
import fs from 'node:fs'
import {
fetchString,
fetchJson,
RequestFailedError,
} from '@overleaf/fetch-utils'
import crypto from 'node:crypto'
import { fileURLToPath } from 'node:url'
const __dirname = Path.dirname(fileURLToPath(import.meta.url))
const CACHE_IN = Path.join(
Path.dirname(Path.dirname(Path.dirname(__dirname))),
'data',
'learnPages'
)
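// Fetch the parsed HTML of a single page via the MediaWiki parse API.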
async function scrape(baseUrl, page) {
const uri = new URL(baseUrl + '/learn-scripts/api.php')
uri.search = new URLSearchParams({
page,
action: 'parse',
format: 'json',
redirects: true,
}).toString()
try {
return await fetchString(uri)
} catch (err) {
if (err instanceof RequestFailedError) {
console.error(err.response.status, page, err.response)
} else {
console.error(err)
}
}
}
function hash(blob) {
return crypto.createHash('sha1').update(blob).digest('hex')
}
function getName(page) {
let enc = encodeURIComponent(page)
// There are VERY long titles in MediaWiki.
// Once percent-encoded they exceed the maximum filename length on my Ubuntu box.
if (enc.length > 100) {
enc = enc.slice(0, 100) + hash(page)
}
return enc
}
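// Return the parsed page from the on-disk cache when available, otherwise scrape and cache it.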
async function scrapeAndCachePage(baseUrl, page) {
const path = Path.join(CACHE_IN, getName(page) + '.json')
try {
return JSON.parse(await fs.promises.readFile(path, 'utf-8'))
} catch (e) {
const blob = await scrape(baseUrl, page)
const parsed = JSON.parse(blob).parse
if (!parsed) {
console.error(page, blob)
throw new Error('bad contents')
}
await fs.promises.mkdir(CACHE_IN, { recursive: true })
await fs.promises.writeFile(path, JSON.stringify(parsed, null, 2), 'utf-8')
return parsed
}
}
async function getAllPagesFrom(baseUrl, continueFrom) {
// https://learn.overleaf.com/learn/Special:ApiSandbox#action=query&format=json&generator=allpages&gapfilterredir=nonredirects
const uri = new URL(baseUrl + '/learn-scripts/api.php')
uri.search = new URLSearchParams({
action: 'query',
format: 'json',
generator: 'allpages',
// Ignore pages with redirects. We do not want to check page content twice.
gapfilterredir: 'nonredirects',
// Bump the default page size of 10 up to 100.
gaplimit: 100,
...continueFrom,
}).toString()
let blob
try {
blob = await fetchJson(uri)
} catch (err) {
if (err instanceof RequestFailedError) {
console.error(err.response.status, continueFrom, err.response)
} else {
console.error(err)
throw err
}
}
const nextContinueFrom = blob && blob.continue
const pagesRaw = (blob && blob.query && blob.query.pages) || {}
const pages = Object.values(pagesRaw).map(page => page.title)
return { nextContinueFrom, pages }
}
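// Follow the API continuation tokens until all page titles have been collected.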
async function getAllPages(baseUrl) {
let continueFrom = {}
let allPages = []
while (true) {
const { nextContinueFrom, pages } = await getAllPagesFrom(
baseUrl,
continueFrom
)
allPages = allPages.concat(pages)
if (!nextContinueFrom) break
continueFrom = nextContinueFrom
}
return allPages.sort()
}
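// Cache the full list of page titles on disk so repeated runs skip the listing step.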
async function getAllPagesAndCache(baseUrl) {
const path = Path.join(CACHE_IN, 'allPages.txt')
try {
return JSON.parse(await fs.promises.readFile(path, 'utf-8'))
} catch (e) {
const allPages = await getAllPages(baseUrl)
await fs.promises.mkdir(CACHE_IN, { recursive: true })
await fs.promises.writeFile(path, JSON.stringify(allPages), 'utf-8')
return allPages
}
}
export default {
getAllPagesAndCache,
scrapeAndCachePage,
}