#!/usr/bin/env node
'use strict';

const fs = require('fs');
const path = require('path');
const prompt = require('prompt-sync')();
const url = require('url');
const cheerio = require('cheerio');
const { Readable, finished } = require('stream');

/** Only stylesheets and scripts served from this host are mirrored locally. */
const WEBFLOW_ASSET_HOST = 'https://uploads-ssl.webflow.com';

/**
 * Memo of downloads already started, keyed by remote URL, so an asset shared
 * by several pages is fetched and written only once per run.
 */
const downloadsByUrl = new Map();

/** Number of local files handed out so far per "<dir>/<stem>" (e.g. "css/style"). */
const assetCounters = new Map();

/**
 * Returns the part of `str` after its first underscore, or `str` unchanged
 * when it contains no underscore.
 *
 * @param {string} str
 * @returns {string}
 */
const removeTextBeforeUnderscore = (str) => {
  const underscoreIndex = str.indexOf('_');
  if (underscoreIndex === -1) {
    return str;
  }
  return str.slice(underscoreIndex + 1);
};

/**
 * Resolves `ref` against `base` and returns it only if it is an http(s) URL.
 *
 * @param {string|undefined} ref - Possibly relative reference taken from HTML.
 * @param {string|URL} [base] - Base used to resolve relative references.
 * @returns {URL|null} The absolute URL, or null when `ref` is missing,
 *   unparseable, or uses another scheme (data:, mailto:, javascript:, ...).
 */
function resolveHttpUrl(ref, base) {
  if (!ref) {
    return null;
  }
  let resolved;
  try {
    resolved = new URL(ref, base);
  } catch {
    return null;
  }
  return resolved.protocol === 'http:' || resolved.protocol === 'https:' ? resolved : null;
}

/**
 * Fetches `target` and rejects unless the response status is 2xx.
 *
 * @param {string|URL} target
 * @returns {Promise<Response>}
 * @throws {Error} When the server answers with a non-2xx status.
 */
async function fetchOk(target) {
  const response = await fetch(target);
  if (!response.ok) {
    throw new Error(`Request for ${target} failed with HTTP ${response.status}`);
  }
  return response;
}

/**
 * Runs `start` at most once per `key` and returns the same promise to every caller.
 *
 * @param {string} key
 * @param {() => Promise<*>} start
 * @returns {Promise<*>}
 */
function downloadOnce(key, start) {
  if (!downloadsByUrl.has(key)) {
    downloadsByUrl.set(key, start());
  }
  return downloadsByUrl.get(key);
}

/**
 * Downloads a binary resource and writes it below the current directory.
 *
 * @param {string} url - Absolute URL of the resource.
 * @param {string} path - Site-relative target path starting with "/"
 *   (e.g. "/images/logo.png"); the file is written to `.${path}`.
 * @returns {Promise<void>}
 * @throws {Error} When the request fails or returns a non-2xx status.
 */
async function downloadImage(url, path) {
  const response = await fetchOk(url);
  const buffer = await response.arrayBuffer();
  await fs.promises.writeFile(`.${path}`, Buffer.from(buffer));
}

/**
 * Saves every <img> and the favicon of a page into ./images and rewrites the
 * markup to point at the local copies.
 *
 * Images without a usable http(s) `src` are left untouched. A download that
 * fails is reported with a warning and does not abort the page; note that its
 * `src` has already been rewritten to the local path at that point.
 *
 * @param {string} body - HTML of the page.
 * @param {string|URL} [baseUrl] - URL of the page, used to resolve relative
 *   image references. Without it only absolute references are downloaded.
 * @returns {Promise<string>} The rewritten HTML.
 */
async function downloadImages(body, baseUrl) {
  fs.mkdirSync('images', { recursive: true });

  // Load the webpage
  const $ = cheerio.load(body);
  const downloads = [];

  // Download all images on the page
  $('img').each((i, el) => {
    const remote = resolveHttpUrl($(el).attr('src'), baseUrl);
    if (!remote) {
      return;
    }
    // Use the URL path so query strings never end up in the file name.
    const imgName = removeTextBeforeUnderscore(path.posix.basename(remote.pathname));
    if (!imgName) {
      return;
    }

    // Always "/"-separated: this value is both a URL in the HTML and a path suffix.
    const localPath = `/images/${imgName}`;
    downloads.push(downloadImage(remote.href, localPath));

    // Replace the image URL with the local path; srcset would still point at the remote host.
    $(el).attr('src', localPath);
    $(el).removeAttr('srcset');
  });

  const faviconLink = $('link[rel="icon"], link[rel="shortcut icon"]').first();
  const faviconUrl = resolveHttpUrl(faviconLink.attr('href'), baseUrl);
  if (faviconUrl) {
    downloads.push(downloadImage(faviconUrl.href, '/images/favicon.ico'));
    // Must go through .attr(): assigning a property on the cheerio wrapper does not touch the DOM.
    faviconLink.attr('href', '/images/favicon.ico');
  }

  const outcomes = await Promise.allSettled(downloads);
  for (const outcome of outcomes) {
    if (outcome.status === 'rejected') {
      console.warn(`Skipping image: ${outcome.reason?.message ?? outcome.reason}`);
    }
  }

  return $.html();
}

/**
 * Collects the paths of all same-site pages linked from a document.
 *
 * Every <a href> is resolved against `url`; links whose origin matches the
 * origin of `url` contribute their pathname (query string and fragment are
 * dropped). Links to other hosts or non-http(s) schemes are ignored.
 *
 * @param {string} body - HTML to scan.
 * @param {string} url - Absolute URL of the site.
 * @returns {Promise<Set<string>>} Site-relative paths, each starting with "/".
 * @throws {TypeError} When `url` is not a valid absolute URL.
 */
async function findAllPages(body, url) {
  const site = new URL(url);
  const $ = cheerio.load(body);
  const hrefs = $('a').map((i, el) => $(el).attr('href')).get();

  const results = new Set();
  for (const href of hrefs) {
    const target = resolveHttpUrl(href, site);
    if (target && target.origin === site.origin) {
      results.add(target.pathname);
    }
  }
  return results;
}

/**
 * Replaces the first `shouldBrand = true;` in a script with `shouldBrand = false;`.
 *
 * @param {string} js - Script source.
 * @returns {string} The patched source.
 */
function removeBranding(js) {
  return js.replace('shouldBrand = true;', 'shouldBrand = false;');
}

/**
 * Downloads a text asset once and stores it as ./<dir>/<stem>[-N]<ext>.
 *
 * The first distinct URL for a given dir/stem keeps the plain name
 * (e.g. "style.css"); further distinct URLs get "-2", "-3", ... so they do not
 * overwrite each other. Asking for the same URL again reuses the first download.
 *
 * @param {string} remoteUrl - Absolute URL of the asset.
 * @param {string} dir - Target directory relative to the working directory.
 * @param {string} stem - File name without extension.
 * @param {string} ext - Extension including the dot.
 * @param {(text: string) => string} [transform] - Applied to the content before writing.
 * @returns {Promise<string>} Site-relative path of the saved file, e.g. "/css/style.css".
 */
function saveTextAsset(remoteUrl, dir, stem, ext, transform = (text) => text) {
  return downloadOnce(remoteUrl, async () => {
    // Runs synchronously up to the first await, so concurrent callers get distinct names.
    const counterKey = `${dir}/${stem}`;
    const ordinal = (assetCounters.get(counterKey) ?? 0) + 1;
    assetCounters.set(counterKey, ordinal);
    const fileName = ordinal === 1 ? `${stem}${ext}` : `${stem}-${ordinal}${ext}`;

    const response = await fetchOk(remoteUrl);
    const content = transform(await response.text());
    await fs.promises.mkdir(dir, { recursive: true });
    await fs.promises.writeFile(path.join(dir, fileName), content);
    return `/${dir}/${fileName}`;
  });
}

/**
 * Copies a file from the root of the site (e.g. "robots.txt") into the working
 * directory. A non-2xx answer is reported with a warning and nothing is
 * written, so an error page is never saved under the file's name.
 *
 * @param {string} websiteUrl - Absolute URL of the site.
 * @param {string} fileName - Name of the file at the site root.
 * @returns {Promise<void>}
 */
async function saveSiteFile(websiteUrl, fileName) {
  const remote = new URL(`/${fileName}`, websiteUrl);
  await downloadOnce(remote.href, async () => {
    const response = await fetch(remote);
    if (!response.ok) {
      console.warn(`Skipping ${fileName}: HTTP ${response.status}`);
      return;
    }
    await fs.promises.writeFile(`./${fileName}`, await response.text());
  });
}

/**
 * Mirrors the stylesheets and scripts a page loads from the Webflow asset host
 * into ./css and ./js, rewrites the page to use the local copies, and copies
 * sitemap.xml and robots.txt from the site root.
 *
 * Downloaded scripts are passed through `removeBranding` before being written.
 *
 * @param {string} htmlContent - HTML of the page.
 * @param {string} websiteUrl - Absolute URL of the site.
 * @returns {Promise<string>} The rewritten HTML.
 * @throws {Error} When a stylesheet or script cannot be downloaded.
 */
async function downloadAssets(htmlContent, websiteUrl) {
  const $ = cheerio.load(htmlContent);
  const assetPromises = [];

  $('link[rel="stylesheet"][href]').each((i, el) => {
    const cssUrl = $(el).attr('href');
    if (!cssUrl.startsWith(WEBFLOW_ASSET_HOST)) {
      return;
    }
    assetPromises.push(
      saveTextAsset(cssUrl, 'css', 'style', '.css').then((localPath) => {
        $(el).attr('href', localPath);
      })
    );
  });

  $('script[src]').each((i, el) => {
    const jsUrl = $(el).attr('src');
    if (!jsUrl.startsWith(WEBFLOW_ASSET_HOST)) {
      return;
    }
    assetPromises.push(
      saveTextAsset(jsUrl, 'js', 'script', '.js', removeBranding).then((localPath) => {
        $(el).attr('src', localPath);
      })
    );
  });

  assetPromises.push(saveSiteFile(websiteUrl, 'sitemap.xml'));
  assetPromises.push(saveSiteFile(websiteUrl, 'robots.txt'));

  await Promise.all(assetPromises);
  return $.html();
}

/**
 * Maps a site path to the flat HTML file it is exported as: the last path
 * segment plus ".html", or "index.html" for the site root.
 *
 * @param {string} page - Site-relative path such as "/" or "/about/".
 * @returns {string}
 */
function pageFileName(page) {
  // Drop trailing slashes first so "/about/" becomes "about.html", not ".html".
  const trimmed = page.replace(/\/+$/, '');
  const stem = trimmed.slice(trimmed.lastIndexOf('/') + 1) || 'index';
  return `${stem}.html`;
}

/**
 * Downloads one page, localises its images and assets, and writes it to disk.
 *
 * @param {URL} pageUrl - Absolute URL of the page.
 * @param {string} fileName - Output file name inside the working directory.
 * @param {string} siteUrl - Absolute URL of the site.
 * @returns {Promise<void>}
 */
async function savePage(pageUrl, fileName, siteUrl) {
  const response = await fetchOk(pageUrl);
  const html = await response.text();
  const withLocalImages = await downloadImages(html, pageUrl);
  const withLocalAssets = await downloadAssets(withLocalImages, siteUrl);
  await fs.promises.writeFile(`./${fileName}`, withLocalAssets);
}

/**
 * Asks for a site URL, discovers the pages linked from it, and exports each
 * one as a static HTML file in the current working directory.
 *
 * Pages are processed one after another; a page that fails is reported and
 * the process exit code is set to 1, but the remaining pages are still exported.
 *
 * @returns {Promise<void>}
 * @throws {Error} When no URL is entered or the start page cannot be fetched.
 */
async function main() {
  const siteUrl = (prompt('Website URL: ') ?? '').trim();
  if (!siteUrl) {
    throw new Error('No website URL was entered.');
  }

  const siteResponse = await fetchOk(siteUrl);
  const siteBody = await siteResponse.text();
  // "/" goes first so the site root always owns index.html.
  const pages = new Set(['/', ...(await findAllPages(siteBody, siteUrl))]);

  // TODO: let the user confirm the discovered page list and add pages manually.

  for (const dir of ['js', 'css', 'images']) {
    fs.mkdirSync(dir, { recursive: true });
  }

  const writtenFiles = new Set();
  for (const page of pages) {
    const fileName = pageFileName(page);
    if (writtenFiles.has(fileName)) {
      console.warn(`Skipping ${page}: ${fileName} was already written by another page`);
      continue;
    }
    writtenFiles.add(fileName);

    try {
      await savePage(new URL(page, siteUrl), fileName, siteUrl);
    } catch (err) {
      console.error(`Failed to export ${page}: ${err.message}`);
      process.exitCode = 1;
    }
  }
}

main().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});