164 lines
4.4 KiB
JavaScript
Executable File
164 lines
4.4 KiB
JavaScript
Executable File
#!/usr/bin/env node
|
|
|
|
const fs = require('fs');
|
|
const path = require('path');
|
|
const prompt = require('prompt-sync')();
|
|
const url = require('url');
|
|
const cheerio = require('cheerio');
|
|
const { Readable, finished } = require('stream');
|
|
|
|
/**
 * Drops everything up to and including the first underscore of a string.
 *
 * Used on image base names to strip the prefix that precedes the first "_".
 *
 * @param {string} str - Text to trim.
 * @returns {string} The part after the first "_", or `str` unchanged when it
 *   contains no underscore.
 */
const removeTextBeforeUnderscore = (str) => {
  const underscoreIndex = str.indexOf('_');
  return underscoreIndex === -1 ? str : str.slice(underscoreIndex + 1);
};
|
|
|
|
/**
 * Downloads one remote file and stores it below the current working directory.
 *
 * @param {string} url - URL handed to `fetch`.
 * @param {string} path - Site-absolute destination such as "/images/logo.png";
 *   a "." is prepended so the file is written relative to the working
 *   directory. The target directory must already exist.
 * @returns {Promise<void>} Resolves once the file is written, or once the
 *   download was skipped because the server answered with a non-2xx status.
 */
async function downloadImage(url, path) {
  const response = await fetch(url);

  if (!response.ok) {
    // Skip instead of saving an HTTP error page under an image file name.
    console.warn(`Skipping ${url}: HTTP ${response.status}`);
    return;
  }

  const buffer = await response.arrayBuffer();
  await fs.promises.writeFile(`.${path}`, Buffer.from(buffer));
}
|
|
|
|
/**
 * Saves every image referenced by a page into ./images and rewrites the
 * markup to point at the local copies.
 *
 * For each <img> with a `src`, the file is downloaded to
 * ./images/<name> (name = URL base name with the text before the first "_"
 * removed), `src` is set to "/images/<name>" and `srcset` is removed.
 * The first favicon link, if any, is downloaded to ./images/favicon.ico and
 * its `href` is rewritten accordingly.
 *
 * NOTE(review): image URLs are passed to `fetch` as-is, so relative `src`
 * values will still be rejected by `fetch`; resolving them would need the
 * page URL, which this function does not receive.
 *
 * @param {string} body - HTML source of the page.
 * @returns {Promise<string>} The HTML with image and favicon URLs localised.
 */
async function downloadImages(body) {
  // `recursive: true` makes this a no-op when the directory already exists.
  fs.mkdirSync('images', { recursive: true });

  const $ = cheerio.load(body);
  const imgPromises = [];

  // Synchronous callback: the downloads are collected and awaited below, so
  // nothing here needs `async` (an async callback would hide thrown errors).
  $('img').each((i, el) => {
    const imgUrl = $(el).attr('src');
    if (!imgUrl) {
      // <img> without a usable src: nothing to download or rewrite.
      return;
    }

    const imgName = removeTextBeforeUnderscore(path.basename(imgUrl));
    const localUrl = `/images/${imgName}`;

    imgPromises.push(downloadImage(imgUrl, localUrl));

    // Built as a URL (forward slashes) rather than with path.join, which
    // would emit backslashes on Windows.
    $(el).attr('src', localUrl);
    // srcset would still reference the remote variants.
    $(el).removeAttr('srcset');
  });

  const faviconLink = $('link[rel="icon"], link[rel="shortcut icon"]').first();
  const faviconUrl = faviconLink.attr('href');

  if (faviconUrl) {
    imgPromises.push(downloadImage(faviconUrl, '/images/favicon.ico'));
    // Must go through .attr(): assigning a property on the cheerio wrapper
    // does not change the serialized HTML.
    faviconLink.attr('href', '/images/favicon.ico');
  }

  await Promise.all(imgPromises);

  return $.html();
}
|
|
|
|
/**
 * Collects the same-site pages linked from a page.
 *
 * Every <a href> is resolved against `url`; links whose origin matches the
 * site's origin are kept and returned as site-relative pathnames (for
 * example "/about"). Query strings and fragments are dropped, so "#top"
 * and "/?ref=x" both collapse to the page they belong to. Links to other
 * hosts (including protocol-relative "//host/..." links) and non-web
 * schemes such as "mailto:" are ignored.
 *
 * @param {string} body - HTML source of the page.
 * @param {string} url - Absolute URL of the site the page belongs to.
 * @returns {Promise<Set<string>>} Unique site-relative paths.
 * @throws {TypeError} If `url` is not a valid absolute URL.
 */
async function findAllPages(body, url) {
  const $ = cheerio.load(body);
  const site = new URL(url);
  const results = new Set();

  $('a[href]').each((i, el) => {
    let target;
    try {
      target = new URL($(el).attr('href'), site);
    } catch {
      // Unparsable href: not a page we can fetch.
      return;
    }

    // Comparing origins (not string prefixes) rejects look-alike hosts such
    // as "https://site.com.evil.org" and protocol-relative external links.
    if (target.origin === site.origin) {
      results.add(target.pathname);
    }
  });

  return results;
}
|
|
|
|
/**
 * Turns off the branding flag in a downloaded script.
 *
 * @param {string} js - JavaScript source text.
 * @returns {string} The source with every literal `shouldBrand = true;`
 *   replaced by `shouldBrand = false;`. Text without the flag is returned
 *   unchanged.
 */
function removeBranding(js) {
  // replaceAll: String.replace with a string pattern stops after the first hit.
  return js.replaceAll('shouldBrand = true;', 'shouldBrand = false;');
}
|
|
|
|
/**
 * Fetches a text resource and writes it to disk.
 *
 * @param {string} assetUrl - URL to fetch.
 * @param {string} filePath - Destination file; its directory is created if missing.
 * @param {(text: string) => string} [transform] - Optional rewrite applied to
 *   the downloaded text before it is written.
 * @returns {Promise<boolean>} `true` when the file was written, `false` when
 *   the server answered with a non-2xx status and the download was skipped.
 */
async function saveRemoteText(assetUrl, filePath, transform = (text) => text) {
  const response = await fetch(assetUrl);

  if (!response.ok) {
    // Do not store an HTTP error page as if it were the asset.
    console.warn(`Skipping ${assetUrl}: HTTP ${response.status}`);
    return false;
  }

  const text = transform(await response.text());
  await fs.promises.mkdir(path.dirname(filePath), { recursive: true });
  // writeFile resolves only after the data is written and the file closed,
  // unlike an un-ended write stream.
  await fs.promises.writeFile(filePath, text);
  return true;
}

/**
 * Downloads the Webflow-hosted stylesheet and script of a page, plus the
 * site's sitemap.xml and robots.txt, and points the markup at the local
 * copies.
 *
 * Stylesheets and scripts whose URL starts with
 * "https://uploads-ssl.webflow.com" are saved as css/style.css and
 * js/script.js (the script is passed through `removeBranding` first) and
 * the referencing `href` / `src` is rewritten to "/css/style.css" /
 * "/js/script.js". A reference is only rewritten when its download
 * succeeded.
 *
 * NOTE(review): every matching stylesheet maps to the same style.css and
 * every matching script to the same script.js, so a page with several of
 * either keeps only one — confirm pages never have more than one.
 *
 * @param {string} htmlContent - HTML source of the page.
 * @param {string} websiteUrl - Base URL of the site (used for sitemap.xml
 *   and robots.txt).
 * @returns {Promise<string>} The HTML with asset URLs localised.
 */
async function downloadAssets(htmlContent, websiteUrl) {
  const webflowCdn = 'https://uploads-ssl.webflow.com';
  const $ = cheerio.load(htmlContent);
  const assetPromises = [];

  // Saves one asset and, on success, rewrites the element's URL attribute.
  const localize = async (el, attrName, remoteUrl, dir, fileName, transform) => {
    const saved = await saveRemoteText(remoteUrl, path.join(dir, fileName), transform);
    if (saved) {
      $(el).attr(attrName, `/${dir}/${fileName}`);
    }
  };

  $('link[rel="stylesheet"][href]').each((i, el) => {
    const cssUrl = $(el).attr('href');
    if (cssUrl.startsWith(webflowCdn)) {
      assetPromises.push(localize(el, 'href', cssUrl, 'css', 'style.css'));
    }
  });

  $('script[src]').each((i, el) => {
    const jsUrl = $(el).attr('src');
    if (jsUrl.startsWith(webflowCdn)) {
      assetPromises.push(localize(el, 'src', jsUrl, 'js', 'script.js', removeBranding));
    }
  });

  // Avoid "//sitemap.xml" when the base URL was entered with a trailing slash.
  const baseUrl = websiteUrl.replace(/\/+$/, '');
  assetPromises.push(saveRemoteText(`${baseUrl}/sitemap.xml`, './sitemap.xml'));
  assetPromises.push(saveRemoteText(`${baseUrl}/robots.txt`, './robots.txt'));

  await Promise.all(assetPromises);

  return $.html();
}
|
|
|
|
/**
 * Interactive entry point: asks for a site URL, discovers the pages linked
 * from its front page and exports each one as a local .html file with its
 * images and assets downloaded.
 *
 * Each page is written to ./<last path segment>.html ("index.html" for the
 * root). NOTE(review): pages that share a last path segment (for example
 * "/a/team" and "/b/team") still map to the same file name.
 *
 * @returns {Promise<void>} Resolves after every page has been processed.
 * @throws {Error} If no URL is entered, the URL is invalid, the front page
 *   cannot be fetched, or at least one page fails to export.
 */
async function main() {
  // prompt-sync returns null when the prompt is aborted.
  const siteUrl = (prompt("Website URL: ") ?? '').trim();
  if (siteUrl === '') {
    throw new Error('No website URL provided');
  }
  const site = new URL(siteUrl);

  const siteResponse = await fetch(siteUrl);
  const siteBody = await siteResponse.text();

  const pages = await findAllPages(siteBody, siteUrl);
  pages.add('/');
  // TODO: show the number of discovered pages and allow adding pages manually.

  // `recursive: true` keeps a second run from failing with EEXIST.
  fs.mkdirSync('js', { recursive: true });
  fs.mkdirSync('css', { recursive: true });
  fs.mkdirSync('images', { recursive: true });

  // Resolve every entry against the site so both "/about" and
  // "https://site/about" work, and de-duplicate by final URL.
  const targets = new Map();
  for (const page of pages) {
    let target;
    try {
      target = new URL(page, site);
    } catch {
      continue;
    }
    if (target.origin !== site.origin) {
      continue;
    }
    target.hash = '';
    if (!targets.has(target.href)) {
      targets.set(target.href, target);
    }
  }

  const exportPage = async (target) => {
    try {
      const response = await fetch(target);
      if (!response.ok) {
        throw new Error(`HTTP ${response.status}`);
      }

      let newHTML = await downloadImages(await response.text());
      newHTML = await downloadAssets(newHTML, siteUrl);

      // Last non-empty path segment, so "/about/" becomes "about" and the
      // root becomes "index".
      const fileName = target.pathname.split('/').filter(Boolean).pop() ?? 'index';
      await fs.promises.writeFile(`./${fileName}.html`, newHTML);
    } catch (err) {
      throw new Error(`Failed to export ${target.href}`, { cause: err });
    }
  };

  // Await the whole batch so main() only resolves when the export is done
  // and failures reach the caller instead of becoming unhandled rejections.
  const outcomes = await Promise.allSettled([...targets.values()].map(exportPage));
  const failures = outcomes.filter((outcome) => outcome.status === 'rejected');

  for (const failure of failures) {
    console.error(failure.reason);
  }
  if (failures.length > 0) {
    throw new Error(`${failures.length} of ${outcomes.length} page(s) failed to export`);
  }
}
|
|
|
|
// Run the exporter; report any failure and signal it through the exit code.
main().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});