-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add URL2PDF, Substack; remove Amazon (they are blocking this now)
- Loading branch information
1 parent
6dbf8f0
commit 84c05e3
Showing
15 changed files
with
1,431 additions
and
73 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
.DS_Store |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
This file was deleted.
Oops, something went wrong.
This file was deleted.
Oops, something went wrong.
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
node_modules | ||
package-lock.json | ||
example.pdf |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
# Download URLs as PDF | ||
|
||
Note: This script scrolls down each page before downloading a PDF to ensure all assets are loaded (e.g. lazily loaded images on a page). This significantly increases page save times. | ||
|
||
## Setup | ||
|
||
### Node.js | ||
1. Install nvm: https://github.com/nvm-sh/nvm#installing-and-updating | ||
1. Install Node.js: `nvm install --lts` | ||
1. Install dependencies: `cd` to this dir, then run `npm install` | ||
|
||
### Input JSON | ||
1. Create a JSON file that has the following form: | ||
```json | ||
[ | ||
{ | ||
"url": "https://www.example.com", | ||
"title": "file-name-of-pdf", | ||
"folder": "name-of-folder" | ||
}, | ||
... | ||
] | ||
``` | ||
1. Create all the folders specified in the JSON file | ||
1. Add the filepath to the value of the `JSON_FILE_PATH` variable in `index.js` | ||
|
||
## Usage | ||
1. `node index.js` | ||
1. [Optional] Log in to any websites if the content is behind a login | ||
1. Press any key to continue | ||
1. Output will be in the folder you specified in the JSON with the filename `${title}.pdf`. You must create the folders before running this script. | ||
|
||
|
||
## Testing | ||
1. Set the value of the `JSON_FILE_PATH` variable in `index.js` to `test-input.json` | ||
1. Run the script: `node index.js` | ||
1. Observe a file called `example.pdf` that has the contents of the example.com webpage in it. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,94 @@ | ||
import puppeteer from 'puppeteer'; | ||
import * as fs from 'fs'; | ||
import * as path from 'path'; | ||
|
||
// Expected JSON file form: an array of entries, one per page to download.
// Keys are double-quoted because the file is parsed with JSON.parse.
// [
//   {
//     "url": "https://www.example.com",
//     "title": "file-name-of-pdf",
//     "folder": "name-of-folder"
//   },
//   ...
// ]
// Path of the JSON input file that main() reads.
const JSON_FILE_PATH = "test-input.json" | ||
|
||
// Browser window size, also reused as the default viewport and in the PDF
// options. Kept as strings; main() parses them where numbers are required.
const PAGE_WIDTH = "1200" | ||
const PAGE_HEIGHT = "800" | ||
|
||
/**
 * Downloads every page listed in the JSON input file as a PDF.
 *
 * Steps: validate the configured input path, launch a visible browser, wait
 * for a keypress (so the user can log in to sites first), then for each entry
 * navigate to its URL, scroll through the page, and write
 * `<folder>/<title>.pdf`. The process exits with code 0 on success and with
 * code 1 when `JSON_FILE_PATH` is empty.
 *
 * @returns {Promise<void>} Never resolves normally: the process exits first.
 * @throws Propagates file, JSON, navigation and PDF errors after the browser
 *   has been closed.
 */
async function main() {
  if (JSON_FILE_PATH === "") {
    console.log("Add the filepath to the value of the `JSON_FILE_PATH` variable in `index.js`");
    process.exit(1);
  }
  console.log("Launching browser...");
  const browser = await puppeteer.launch({
    headless: false,
    args: [`--window-size=${PAGE_WIDTH},${PAGE_HEIGHT}`],
    defaultViewport: {
      width: Number.parseInt(PAGE_WIDTH, 10),
      height: Number.parseInt(PAGE_HEIGHT, 10),
    },
  });

  // Close the browser even when a download fails, so no Chromium process is
  // left behind.
  try {
    const pages = await browser.pages();
    const page = pages[0]; // Get first tab in open browser

    // In case what you need to download is behind a login form
    console.log("Login in the browser and then press any key to continue...");
    await keypress();
    console.log("Key pressed.");

    const rawdata = fs.readFileSync(JSON_FILE_PATH, 'utf8');
    const webpages = JSON.parse(rawdata);
    for (const webpage of webpages) {
      console.log(`Downloading ${webpage.url}`);

      // Navigate to page and wait until network traffic settles. The previous
      // `networkIdleTimeout: 5000` entry was dropped: it is not among the
      // documented Page.goto options (referer, timeout, waitUntil).
      await page.goto(webpage.url, { waitUntil: 'networkidle2' });
      // Scroll through the page to ensure all content loads
      await autoScroll(page);

      // Create the target folder if needed. An empty folder means the
      // current directory, which must not be passed to mkdirSync.
      if (webpage.folder) {
        fs.mkdirSync(webpage.folder, { recursive: true });
      }

      // Save PDF
      const filename = `${webpage.title}.pdf`;
      const filePath = path.join(webpage.folder, filename);
      // NOTE(review): Puppeteer's PDFOptions docs say `format` takes priority
      // over `width`/`height` -- confirm which of the two is intended.
      const pdfConfig = {
        path: filePath, // Saves file to this location
        format: 'A4',
        width: `${PAGE_WIDTH}px`,
        height: `${PAGE_HEIGHT}px`,
      };
      await page.pdf(pdfConfig);
    }
  } finally {
    await browser.close();
  }
  console.log('Done.');
  // Explicit exit: the stdin listener used for the keypress prompt may
  // otherwise keep the process alive.
  process.exit(0);
}
|
||
|
||
/**
 * Waits for a single chunk of input on stdin (i.e. "press any key").
 *
 * When stdin is a terminal it is switched to raw mode for the duration of the
 * wait so that one key press is delivered immediately instead of after Enter.
 * When stdin is piped or redirected, raw mode is unavailable and the first
 * chunk of data is accepted instead.
 *
 * @returns {Promise<void>} Resolves after the first `data` event on stdin.
 */
const keypress = async () => {
  const { stdin } = process;
  // setRawMode only exists on TTY streams; calling it on a pipe would throw.
  const rawModeAvailable = Boolean(stdin.isTTY) && typeof stdin.setRawMode === 'function';
  if (rawModeAvailable) {
    stdin.setRawMode(true);
  }
  return new Promise((resolve) => stdin.once('data', () => {
    if (rawModeAvailable) {
      stdin.setRawMode(false);
    }
    // Attaching a 'data' listener puts stdin into flowing mode; pause it again
    // so the open stream does not keep the event loop alive.
    stdin.pause();
    resolve();
  }));
};
|
||
/**
 * Scrolls the page downwards in fixed steps until the accumulated scroll
 * distance reaches the bottom of the document, giving lazily loaded content a
 * chance to load.
 *
 * The document height is re-read on every tick, so content that grows while
 * scrolling extends the scroll.
 *
 * @param {object} page - Object exposing `evaluate(fn, ...args)`, which runs
 *   `fn` in the page context (a Puppeteer page in this script).
 * @param {object} [options]
 * @param {number} [options.distance=100] - Pixels scrolled per tick.
 * @param {number} [options.intervalMs=100] - Delay between ticks, in ms.
 * @returns {Promise<void>} Resolves once the bottom has been reached.
 */
async function autoScroll(page, { distance = 100, intervalMs = 100 } = {}) {
  // The callback runs in the page context, so it can only see the values
  // passed to it as arguments.
  await page.evaluate(
    (step, delay) =>
      new Promise((resolve) => {
        let scrolled = 0;
        const timer = setInterval(() => {
          // Read the height before scrolling, then move one step down.
          const scrollHeight = document.body.scrollHeight;
          window.scrollBy(0, step);
          scrolled += step;

          if (scrolled >= scrollHeight - window.innerHeight) {
            clearInterval(timer);
            resolve();
          }
        }, delay);
      }),
    distance,
    intervalMs,
  );
}
|
||
// Entry point: run the script and report any failure with a non-zero exit
// code instead of leaving the returned promise unhandled.
main().catch((err) => {
  console.error(err);
  process.exit(1);
});
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
{ | ||
"name": "download-urls-as-pdf", | ||
"version": "1.0.0", | ||
"description": "", | ||
"main": "index.js", | ||
"type": "module", | ||
"scripts": { | ||
"test": "echo \"Error: no test specified\" && exit 1" | ||
}, | ||
"author": "", | ||
"license": "ISC", | ||
"dependencies": { | ||
"puppeteer": "^19.6.2" | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
[ | ||
{ | ||
"url": "https://www.example.com", | ||
"title": "example", | ||
"folder": "" | ||
} | ||
] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
results.json |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
# Get all Substack URLs for a user | ||
|
||
Note: This was originally written in early 2023 and depends on an internal Substack API, which may change at any time. | ||
|
||
## Setup | ||
1. Install Go: `brew install go` | ||
|
||
## Usage | ||
1. Enter the base URL of the Substack newsletter as the value of the `BASE_URL` variable in `main.go` | ||
1. Run `go run main.go` |
Oops, something went wrong.