first commit
This commit is contained in:
commit
346413dc1a
1
.gitignore
vendored
Normal file
1
.gitignore
vendored
Normal file
@ -0,0 +1 @@
|
||||
node_modules/
|
164
main.js
Executable file
164
main.js
Executable file
@ -0,0 +1,164 @@
|
||||
#!/usr/bin/env node
|
||||
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
const prompt = require('prompt-sync')();
|
||||
const url = require('url');
|
||||
const cheerio = require('cheerio');
|
||||
const { Readable, finished } = require('stream');
|
||||
|
||||
/**
 * Drops everything up to and including the first underscore of a string.
 *
 * @param {string} str - Text to trim.
 * @returns {string} The part after the first `_`, or `str` unchanged when it
 *   contains no underscore.
 */
const removeTextBeforeUnderscore = (str) => {
  const cut = str.indexOf('_');
  return cut === -1 ? str : str.slice(cut + 1);
};
|
||||
|
||||
/**
 * Fetches a remote resource and stores its raw bytes on disk.
 *
 * @param {string} url - Address handed to `fetch`.
 * @param {string} path - Destination; it is appended directly after a leading
 *   `.`, so `/images/a.png` is written to `./images/a.png`.
 * @returns {Promise<void>} Settles once the file has been written.
 */
async function downloadImage(url, path) {
  const res = await fetch(url);
  const payload = Buffer.from(await res.arrayBuffer());
  await fs.promises.writeFile(`.${path}`, payload);
}
|
||||
|
||||
/**
 * Downloads every image (and the favicon) referenced by a page into the local
 * `images` directory and rewrites the markup to point at the local copies.
 *
 * Image URLs are passed to `downloadImage` exactly as written in the markup.
 * The local file name is the URL's base name with any prefix up to the first
 * underscore removed.
 *
 * @param {string} body - HTML source of the page.
 * @returns {Promise<string>} The page HTML with image and favicon URLs rewritten.
 * @throws Rejects when the `images` directory cannot be created or when any
 *   download fails.
 */
async function downloadImages(body) {
  // `recursive: true` makes this a no-op when the directory already exists.
  fs.mkdirSync('images', { recursive: true });

  const $ = cheerio.load(body);
  const imgPromises = [];

  // The callback is deliberately synchronous: downloads are collected as
  // promises and awaited together below, so nothing is left floating.
  $('img').each((i, el) => {
    const imgUrl = $(el).attr('src');

    // Skip <img> tags with no src, and inline data: URIs which need no download.
    if (!imgUrl || imgUrl.startsWith('data:')) {
      return;
    }

    const imgName = removeTextBeforeUnderscore(path.basename(imgUrl));
    // Built as a URL path (always forward slashes), not a filesystem path.
    const localPath = `/images/${imgName}`;

    imgPromises.push(downloadImage(imgUrl, localPath));

    // Point the tag at the local copy; srcset would still reference remote files.
    $(el).attr('src', localPath);
    $(el).removeAttr('srcset');
  });

  const faviconLink = $('link[rel="icon"], link[rel="shortcut icon"]').first();
  const faviconUrl = faviconLink.attr('href');

  // Pages without a favicon link have nothing to download or rewrite.
  if (faviconUrl) {
    imgPromises.push(downloadImage(faviconUrl, '/images/favicon.ico'));
    // Must go through .attr(): assigning `.href` on the cheerio wrapper does
    // not change the serialized markup.
    faviconLink.attr('href', '/images/favicon.ico');
  }

  await Promise.all(imgPromises);

  return $.html();
}
|
||||
|
||||
/**
 * Collects the internal pages linked from a document.
 *
 * A link counts as internal when it is root-relative (`/about`) or when it is
 * an absolute URL under `url`. Every result is returned as a root-relative
 * path with any `#fragment` removed, so the same page is not listed twice and
 * callers can append the result to the site URL.
 *
 * @param {string} body - HTML source to scan for `<a href>` links.
 * @param {string} url - Base URL of the site being scanned.
 * @returns {Promise<Set<string>>} Unique root-relative paths of internal pages.
 */
async function findAllPages(body, url) {
  const $ = cheerio.load(body);
  const hrefs = $('a').map((i, el) => $(el).attr('href')).get();

  // Compare without trailing slashes so both "https://site" and
  // "https://site/" recognise "https://site/about".
  const base = url.replace(/\/+$/, '');
  const results = new Set();

  for (const href of hrefs) {
    let pagePath;

    if (href.startsWith('//')) {
      // Protocol-relative links name a host, not a path on this site.
      continue;
    } else if (href.startsWith('/')) {
      pagePath = href;
    } else if (href === base || href.startsWith(`${base}/`)) {
      // Requiring the "/" boundary keeps "https://site.com.evil.org" out.
      pagePath = href.slice(base.length) || '/';
    } else {
      continue;
    }

    // Fragments address a position inside a page, not a separate page.
    results.add(pagePath.split('#')[0]);
  }

  return results;
}
|
||||
|
||||
/**
 * Disables the branding flag in a script by rewriting its assignment.
 *
 * @param {string} js - JavaScript source text.
 * @returns {string} The source with every `shouldBrand = true;` turned into
 *   `shouldBrand = false;`; unchanged when the statement does not occur.
 */
function removeBranding(js) {
  // replaceAll: String.prototype.replace with a string pattern would only
  // rewrite the first occurrence.
  return js.replaceAll('shouldBrand = true;', 'shouldBrand = false;');
}
|
||||
|
||||
/**
 * Saves a page's Webflow-hosted stylesheet and script locally, rewrites the
 * markup to reference the local copies, and also fetches the site's
 * `sitemap.xml` and `robots.txt`.
 *
 * Only assets whose URL starts with `https://uploads-ssl.webflow.com` are
 * downloaded. They are stored as `css/style.css` and `js/script.js`; the `css`
 * and `js` directories must already exist.
 *
 * NOTE: the local names are fixed, so if a page references several matching
 * stylesheets (or scripts) they are all written to the same file.
 *
 * @param {string} htmlContent - HTML source of the page.
 * @param {string} websiteUrl - Base URL of the site (used for sitemap/robots).
 * @returns {Promise<string>} The page HTML with asset URLs rewritten.
 * @throws Rejects on network failure or when a file cannot be written. HTTP
 *   error responses are skipped with a warning instead.
 */
async function downloadAssets(htmlContent, websiteUrl) {
  const cdnPrefix = 'https://uploads-ssl.webflow.com';
  // Avoid "https://site//sitemap.xml" when the URL was entered with a trailing slash.
  const siteRoot = websiteUrl.replace(/\/+$/, '');
  const $ = cheerio.load(htmlContent);
  const assetPromises = [];

  // Fetches `assetUrl` as text and writes it to `filePath`. Resolves to true
  // once the file is fully written, or false when the server answered with an
  // error status (so an error page is never stored as if it were the asset).
  const saveText = async (assetUrl, filePath, transform = (text) => text) => {
    const res = await fetch(assetUrl);
    if (!res.ok) {
      console.warn(`Skipping ${assetUrl}: HTTP ${res.status}`);
      return false;
    }
    // writeFile resolves only after the data is flushed and the descriptor is
    // closed, unlike an un-ended write stream.
    await fs.promises.writeFile(filePath, transform(await res.text()));
    return true;
  };

  $('link[rel="stylesheet"][href]').each((i, el) => {
    const cssUrl = $(el).attr('href');
    if (!cssUrl.startsWith(cdnPrefix)) {
      return;
    }
    const cssName = 'style.css';
    assetPromises.push(
      saveText(cssUrl, path.join('./css', cssName)).then((saved) => {
        // Keep the remote URL when no local copy could be stored.
        if (saved) {
          $(el).attr('href', `/css/${cssName}`);
        }
      }),
    );
  });

  $('script[src]').each((i, el) => {
    const jsUrl = $(el).attr('src');
    if (!jsUrl.startsWith(cdnPrefix)) {
      return;
    }
    const jsName = 'script.js';
    assetPromises.push(
      saveText(jsUrl, path.join('./js', jsName), removeBranding).then((saved) => {
        if (saved) {
          $(el).attr('src', `/js/${jsName}`);
        }
      }),
    );
  });

  // Both files are optional on the remote site; a 404 is merely reported.
  assetPromises.push(saveText(`${siteRoot}/sitemap.xml`, './sitemap.xml'));
  assetPromises.push(saveText(`${siteRoot}/robots.txt`, './robots.txt'));

  await Promise.all(assetPromises);

  return $.html();
}
|
||||
|
||||
/**
 * CLI entry point: asks for a site URL, discovers the pages linked from it and
 * writes a localized copy of each page (plus its images and assets) into the
 * current working directory.
 *
 * Each page is saved as `<last path segment>.html`; the root page becomes
 * `index.html`. A page that fails is reported on stderr and the process exit
 * code is set to 1, but the remaining pages are still processed.
 *
 * @returns {Promise<void>} Settles once every page has been processed.
 * @throws {Error} When no URL is entered; also rejects if the start page
 *   cannot be fetched or the output directories cannot be created.
 */
async function main() {
  // prompt-sync returns null when the prompt is aborted (e.g. Ctrl+C).
  const answer = prompt("Website URL: ");
  const siteUrl = answer ? answer.trim() : '';
  if (siteUrl === '') {
    throw new Error('No website URL provided.');
  }

  const siteResponse = await fetch(siteUrl);
  const siteBody = await siteResponse.text();
  const pages = await findAllPages(siteBody, siteUrl);
  pages.add('/');

  // TODO: ask the user to confirm the number of pages found and allow adding
  // pages manually.

  // `recursive: true` keeps a second run from failing with EEXIST.
  for (const dir of ['js', 'css', 'images']) {
    fs.mkdirSync(dir, { recursive: true });
  }

  // Pages are handled one at a time because they all write to the same shared
  // output files (css/style.css, js/script.js, sitemap.xml, ...).
  for (const page of pages) {
    try {
      // Resolving against the site URL handles root-relative and absolute
      // links alike, and copes with a trailing slash on the site URL.
      const pageUrl = new URL(page, siteUrl);
      const response = await fetch(pageUrl);

      let newHTML = await downloadImages(await response.text());
      newHTML = await downloadAssets(newHTML, siteUrl);

      // Last non-empty path segment; undefined for the site root.
      const lastSegment = pageUrl.pathname.split('/').filter(Boolean).pop();
      await fs.promises.writeFile(`./${lastSegment ?? 'index'}.html`, newHTML);
    } catch (error) {
      console.error(`Failed to download ${page}:`, error);
      process.exitCode = 1;
    }
  }
}
|
||||
|
||||
// Run the CLI. On failure, log the error and exit with a non-zero status so
// shells and scripts can detect that the download did not succeed.
main().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});
|
1260
package-lock.json
generated
Normal file
1260
package-lock.json
generated
Normal file
File diff suppressed because it is too large
Load Diff
21
package.json
Normal file
21
package.json
Normal file
@ -0,0 +1,21 @@
|
||||
{
|
||||
"name": "de-webflower",
|
||||
"version": "1.0.0",
|
||||
"description": "",
|
||||
"main": "main.js",
|
||||
"scripts": {
|
||||
"test": "echo \"Error: no test specified\" && exit 1"
|
||||
},
|
||||
"bin": {
|
||||
"deflow": "./main.js"
|
||||
},
|
||||
"keywords": [],
|
||||
"author": "",
|
||||
"license": "ISC",
|
||||
"dependencies": {
|
||||
"cheerio": "^1.0.0-rc.12",
|
||||
"node-fetch": "^3.3.0",
|
||||
"prompt-sync": "^4.2.0",
|
||||
"request": "^2.88.2"
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue
Block a user