Add URL2PDF, Substack; remove Amazon (they are blocking this now)

mattcarrollcode · Jan 30, 2023 · 84c05e3 · 84c05e3
1 parent 6dbf8f0
commit 84c05e3
Show file tree

Hide file tree

Showing 15 changed files with 1,431 additions and 73 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1 @@
+.DS_Store
diff --git a/README.md b/README.md
@@ -4,6 +4,7 @@ This is a collection of scripts I've written that may or may not be useful to my
 
 # List of scripts
 * [new-machine-setup](new-machine-setup) is a guide for new machine setup and a `~/.zshrc` template
-* [amazon_seller_feedback](amazon_seller_feedback) is a script to get bad reviews of Amazon sellers
 * [macos-window-resizer](macos-window-resizer) is a script to resize windows for OBS capture on macOS
-* [github-unsubscribe-script](github-unsubscribe-script) is a script that unsubscribes you from all notifications
+* [github-unsubscribe-script](github-unsubscribe-script) is a script that unsubscribes you from all GitHub notifications
+* [download-urls-as-pdf](download-urls-as-pdf) is a script that downloads a set of webpages as PDFs
+* [get-all-substack-urls-for-user](get-all-substack-urls-for-user) is a script that downloads information on all of a user's articles on Substack
diff --git a/amazon_seller_feedback/.python-version b/amazon_seller_feedback/.python-version
diff --git a/amazon_seller_feedback/README.md b/amazon_seller_feedback/README.md
diff --git a/amazon_seller_feedback/amazon_seller_feedback.py b/amazon_seller_feedback/amazon_seller_feedback.py
diff --git a/amazon_seller_feedback/requirements.txt b/amazon_seller_feedback/requirements.txt
diff --git a/download-urls-as-pdf/.gitignore b/download-urls-as-pdf/.gitignore
@@ -0,0 +1,3 @@
+node_modules
+package-lock.json
+example.pdf
diff --git a/download-urls-as-pdf/README.md b/download-urls-as-pdf/README.md
@@ -0,0 +1,37 @@
+# Download URLs as PDF
+
+Note: This script scrolls down each page before saving it as a PDF to ensure all assets are loaded (e.g. lazily loaded images). This significantly increases the time it takes to save each page.
+
+## Setup
+
+### Node.js
+1. Install nvm: https://github.com/nvm-sh/nvm#installing-and-updating
+1. Install Node.js: `nvm install --lts`
+1. Install dependencies: `cd` to this directory and run `npm install`
+
+### Input JSON
+1. Create a JSON file that has the following form:
+   ```json
+   [
+       {
+           "url": "https://www.example.com",
+           "title": "file-name-of-pdf",
+           "folder": "name-of-folder"
+       },
+       ...
+   ]
+   ```
+1. Create all the folders specified in the JSON file
+1. Add the filepath to the value of the `JSON_FILE_PATH` variable in `index.js`
+
+## Usage
+1. `node index.js`
+1. [Optional] Log in to any websites whose content is behind a login
+1. Press any key to continue
+1. Output will be in the folder you specified in the JSON with the filename `${title}.pdf`. You must create the folders before running this script.
+
+
+## Testing
+1. Set the value of the `JSON_FILE_PATH` variable in `index.js` to `test-input.json`
+1. Run the script: `node index.js`
+1. Observe a file called `example.pdf` that has the contents of the example.com webpage in it.
diff --git a/download-urls-as-pdf/index.js b/download-urls-as-pdf/index.js
@@ -0,0 +1,94 @@
+import puppeteer from 'puppeteer';
+import * as fs from 'fs';
+import * as path from 'path';
+
+// Expected JSON file form (must be valid JSON: keys are double-quoted):
+// [
+//     {
+//         "url": "https://www.example.com",
+//         "title": "file-name-of-pdf",
+//         "folder": "name-of-folder"
+//     },
+//     ... (one object per page to download)
+// ]
+const JSON_FILE_PATH = "test-input.json" // Path of the input JSON file read by main(); an empty string aborts the run
+
+const PAGE_WIDTH = "1200" // Pixels; kept as a string because it is interpolated into --window-size and the PDF width
+const PAGE_HEIGHT = "800" // Pixels; kept as a string because it is interpolated into --window-size and the PDF height
+
+async function main() {
+    if (JSON_FILE_PATH == "") {
+        console.log("Add the filepath to the value of the `JSON_FILE_PATH` variable in `index.js`")
+        process.exit(1)
+    }
+    console.log("Launching browser...")
+    const browser = await puppeteer.launch({
+        headless: false,
+        args: [`--window-size=${PAGE_WIDTH},${PAGE_HEIGHT}`],
+        defaultViewport: {
+            width: parseInt(PAGE_WIDTH, 10),
+            height: parseInt(PAGE_HEIGHT, 10)
+        }
+    });
+    const pages = await browser.pages()
+    const page = pages[0] // Get first tab in open browser
+
+    // In case what you need to download is behind a login form
+    console.log("Login in the browser and then press any key to continue...")
+    await keypress()
+    console.log("Key pressed.")
+
+    let rawdata = fs.readFileSync(JSON_FILE_PATH);
+    let webpages = JSON.parse(rawdata);
+    for (const webpage of webpages) {
+        console.log(`Downloading ${webpage.url}`);
+
+        // Navigate to page, wait until all network traffic stops
+        await page.goto(webpage.url, { waitUntil: 'networkidle2', networkIdleTimeout: 5000 });
+        // Scroll through the page to ensure all content loads
+        await autoScroll(page);
+        // Save PDF
+        const filename = `${webpage.title}.pdf`
+        const filePath = path.join(webpage.folder, filename);
+        const pdfConfig = {
+            path: filePath, // Saves file to this location
+            format: 'A4',
+            width: `${PAGE_WIDTH}px`,
+            height: `${PAGE_HEIGHT}px`
+        };
+        await page.pdf(pdfConfig);
+    }
+    await browser.close();
+    console.log('Done.')
+    process.exit(0)
+}
+
+
+const keypress = async () => {
+    process.stdin.setRawMode(true)
+    return new Promise(resolve => process.stdin.once('data', () => {
+        process.stdin.setRawMode(false)
+        resolve()
+    }))
+}
+
+async function autoScroll(page) {
+    await page.evaluate(async () => {
+        await new Promise((resolve) => {
+            var totalHeight = 0;
+            var distance = 100;
+            var timer = setInterval(() => {
+                var scrollHeight = document.body.scrollHeight;
+                window.scrollBy(0, distance);
+                totalHeight += distance;
+
+                if (totalHeight >= scrollHeight - window.innerHeight) {
+                    clearInterval(timer);
+                    resolve();
+                }
+            }, 100);
+        });
+    });
+}
+
+main()
diff --git a/download-urls-as-pdf/package.json b/download-urls-as-pdf/package.json
@@ -0,0 +1,15 @@
+{
+  "name": "download-urls-as-pdf",
+  "version": "1.0.0",
+  "description": "",
+  "main": "index.js",
+  "type": "module",
+  "scripts": {
+    "test": "echo \"Error: no test specified\" && exit 1"
+  },
+  "author": "",
+  "license": "ISC",
+  "dependencies": {
+    "puppeteer": "^19.6.2"
+  }
+}
diff --git a/download-urls-as-pdf/test-input.json b/download-urls-as-pdf/test-input.json
@@ -0,0 +1,7 @@
+[
+    {
+        "url": "https://www.example.com",
+        "title": "example",
+        "folder": ""
+    }
+]
diff --git a/get-all-substack-urls-for-user/.gitignore b/get-all-substack-urls-for-user/.gitignore
@@ -0,0 +1 @@
+results.json
diff --git a/get-all-substack-urls-for-user/README.md b/get-all-substack-urls-for-user/README.md
@@ -0,0 +1,10 @@
+# Get all Substack URLs for a user
+
+Note: this was originally written in early 2023 and depends on an internal Substack API, which may change at any time.
+
+## Setup
+1. Install Go: `brew install go`
+
+## Usage
+1. Enter the base URL of the Substack newsletter as the value of the `BASE_URL` variable in `main.go`
+1. Run `go run main.go`