chromedp
- retrieve response body: https://github.com/chromedp/chromedp/issues/427 (see the sketch below)
- how to save requests: https://github.com/chromedp/chromedp/issues/42
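Both issues boil down to the same DevTools-protocol flow: enable the Network domain, listen for responseReceived events, and call Network.getResponseBody for the requests you care about. Below is a rough chromedp sketch of that flow, not a definitive recipe; the "/api/" URL filter, the example.com target, and the 2-second grace sleep are made-up placeholders, and note that a body may not be retrievable until Chrome has finished loading that request.

package main

import (
	"context"
	"log"
	"strings"
	"time"

	"github.com/chromedp/cdproto/cdp"
	"github.com/chromedp/cdproto/network"
	"github.com/chromedp/chromedp"
)

func main() {
	ctx, cancel := chromedp.NewContext(context.Background())
	defer cancel()

	// log the body of every response whose URL matches a placeholder filter
	chromedp.ListenTarget(ctx, func(ev interface{}) {
		resp, ok := ev.(*network.EventResponseReceived)
		if !ok || !strings.Contains(resp.Response.URL, "/api/") { // placeholder filter
			return
		}
		requestID := resp.RequestID
		url := resp.Response.URL
		// must not block inside the listener, so fetch the body in a goroutine
		go func() {
			c := chromedp.FromContext(ctx)
			body, err := network.GetResponseBody(requestID).Do(cdp.WithExecutor(ctx, c.Target))
			if err != nil {
				log.Printf("%s: %v", url, err)
				return
			}
			log.Printf("%s -> %d bytes", url, len(body))
		}()
	})

	err := chromedp.Run(ctx,
		network.Enable(),
		chromedp.Navigate("https://example.com"), // placeholder target
		chromedp.Sleep(2*time.Second),            // crude grace period for in-flight body fetches
	)
	if err != nil {
		log.Fatal(err)
	}
}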
Articles
- https://blog.shapesecurity.com/2018/09/17/intercepting-and-modifying-responses-with-chrome-via-the-devtools-protocol/
- https://docs.browserless.io/blog/2019/03/13/more-observations.html
- https://blog.lovemily.me/a-deep-dive-guide-for-crawling-spa-with-puppeteer-and-troubleshooting/
- https://docs.browserless.io/blog/2018/06/04/puppeteer-best-practices.html
- https://nemethgergely.com/puppeteer-browser-automation/
- https://blog.lovemily.me/a-deep-dive-guide-for-crawling-spa-with-puppeteer-and-troubleshooting/#get-async-ajax-data
- https://www.santoshsrinivas.com/puppeteer-api-to-control-headless-chrome/
- https://tutorialzine.com/2017/08/automating-google-chrome-with-node-js
- https://developers.google.com/web/updates/2017/04/headless-chrome
- https://github.com/yujiosaka/headless-chrome-crawler
- https://medium.com/@e_mad_ehsan/getting-started-with-puppeteer-and-chrome-headless-for-web-scrapping-6bf5979dee3e
- https://codeburst.io/a-guide-to-automating-scraping-the-web-with-javascript-chrome-puppeteer-node-js-b18efb9e9921
- https://www.sitepen.com/blog/2017/10/04/browser-automation-with-puppeteer/
- https://intoli.com/blog/saving-images/
- https://intoli.com/blog/making-chrome-headless-undetectable/
- https://alligator.io/tooling/puppeteer/
- https://www.youtube.com/watch?v=lhZOFUY1weo : 10 things to do with puppeteer
- https://github.com/ChromeDevTools/awesome-chrome-devtools
- https://chromium.googlesource.com/chromium/src/+/lkgr/headless/README.md
- https://chromedevtools.github.io/devtools-protocol/
- https://news.ycombinator.com/item?id=14239194 (Hacker News discussion)
- https://adriftwith.me/coding/2017/04/21/headless-slimerjs-with-firefox/
- https://levels.io/phantomjs-social-media-share-pictures/
- https://riston.github.io/post/headless-chrome-scrape/ : example of scraping links from r/programming, using node library
- https://riston.github.io/post/puppeteer-chrome-scrape/
- http://i-programmer.info/news/87-web-development/11344-headless-chrome-and-the-puppeteer-library-for-scraping-and-testing-the-web.html
Tools
Note: the two online playgrounds below fail more often than they work.
- https://try-puppeteer.appspot.com/
- https://puppeteersandbox.com/
- https://github.com/adieuadieu/serverless-chrome : how to run headless chrome on AWS
- https://github.com/gajus/usus : Renders page using Chrome Debugging Protocol
Scripts
Puppeteer script to do Google searches:
const puppeteer = require('puppeteer');

(async () => {
  const browser = await puppeteer.launch({headless: false});
  const page = await browser.newPage();
  page.on('load', () => console.log('LOADED: ' + page.url()));
  await page.goto('https://google.com');
  await page.waitForSelector('input[name=q]');
  await page.focus('input[name=q]');
  await page.keyboard.type('blin');
  await page.keyboard.press('Enter');
  for (let i = 0; i < 10; i++) {
    const searchResult = `div.g:nth-child(${i + 1}) h3 a`;
    await page.waitForSelector(searchResult, {visible: true});
    // click and wait for the resulting navigation together to avoid a race
    await Promise.all([page.waitForNavigation(), page.click(searchResult)]);
    await page.screenshot({path: `screenshot-${i + 1}.png`});
    await page.goBack();
  }
  await browser.close();
})();
Puppeteer script to extract URLs for a given search query:
const puppeteer = require('puppeteer');

(async () => {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  await page.goto('https://google.com', {waitUntil: 'networkidle2'});
  await page.waitForSelector('input[name=q]');
  // type our query into the search bar
  await page.type('input[name=q]', 'epub reader windows');
  await page.click('input[type="submit"]');
  // wait for the results to show up
  await page.waitForSelector('h3 a');
  // extract the result URLs from the page
  // (Google's result markup changes over time; 'h3.r a' may need updating)
  const links = await page.evaluate(() => {
    const anchors = Array.from(document.querySelectorAll('h3.r a'));
    return anchors.map(anchor => anchor.href);
  });
  console.log(links.join('\n'));
  await browser.close();
})();
Puppeteer script to extract the first 100 search results for a query:
const puppeteer = require('puppeteer');

(async () => {
  const browser = await puppeteer.launch();

  // builds e.g. https://www.google.com/search?q=epub+reader&start=10
  function buildQuery(term, pageNo = 0) {
    const q = encodeURIComponent(term).replace(/%20/g, '+');
    let url = 'https://google.com/search?q=' + q;
    if (pageNo > 0) {
      url += '&start=' + pageNo * 10;
    }
    return url;
  }

  // loads one results page in its own tab and collects the result links
  // (a fresh tab per URL so the parallel version below doesn't fight over one page)
  async function extractLinks(url) {
    console.log(`url: ${url}`);
    const page = await browser.newPage();
    await page.goto(url, {waitUntil: 'networkidle2'});
    // wait for the results to show up
    await page.waitForSelector('h3 a');
    // extract the result URLs from the page
    const links = await page.evaluate(() => {
      const anchors = Array.from(document.querySelectorAll('h3.r a'));
      return anchors.map(anchor => anchor.href);
    });
    await page.close();
    return links;
  }

  // sequential version: one results page at a time
  async function extractFirst100(term) {
    const maxPages = 1; // set to 10 for the full 100 results
    for (let pageNo = 0; pageNo < maxPages; pageNo++) {
      const url = buildQuery(term, pageNo);
      const links = await extractLinks(url);
      console.log(links.join('\n'));
    }
  }

  // concurrent version: fetch all results pages in parallel
  async function extractFirst100v2(term) {
    const maxPages = 3; // set to 10 for the full 100 results
    const pending = [];
    for (let pageNo = 0; pageNo < maxPages; pageNo++) {
      pending.push(extractLinks(buildQuery(term, pageNo)));
    }
    const res = await Promise.all(pending);
    for (const links of res) {
      console.log(links.join('\n'));
    }
  }

  await extractFirst100('epub reader');
  await browser.close();
})();
Libraries
For Go:
- https://github.com/chromedp/chromedp : Go library to drive Chrome via debugger protocol (minimal sketch after this list)
- https://github.com/mafredri/cdp : most high-level
- https://github.com/raff/godet : low-level
- https://github.com/wirepair/gcd : low-level
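For orientation, a minimal chromedp sketch of what driving Chrome over the debugger protocol looks like from Go: navigate, wait for the page body, save a full-page screenshot. The URL and output filename are placeholders, and FullScreenshot needs a reasonably recent chromedp release.

package main

import (
	"context"
	"log"
	"os"

	"github.com/chromedp/chromedp"
)

func main() {
	// NewContext is cheap; the headless Chrome instance starts on the first Run
	ctx, cancel := chromedp.NewContext(context.Background())
	defer cancel()

	var buf []byte
	err := chromedp.Run(ctx,
		chromedp.Navigate("https://example.com"), // placeholder URL
		chromedp.WaitVisible("body"),
		chromedp.FullScreenshot(&buf, 100), // quality 100 -> PNG
	)
	if err != nil {
		log.Fatal(err)
	}
	if err := os.WriteFile("screenshot.png", buf, 0644); err != nil {
		log.Fatal(err)
	}
}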