Home / Programming / web scraping and crawling / puppeteer, headless chrome, cdp, chromedp edit
Try Documentalist, my app that offers fast, offline access to 190+ programmer API docs.

  • scripts
    • Puppeteer script to do Google searches
      // Searches Google for "blin", then opens each of the first 10 results in
      // turn and saves a screenshot of it as screenshot-<n>.png.
      // NOTE(review): `new Browser(...)`, `page.navigate` and `page.press` look
      // like a very early Puppeteer API — confirm against the installed version.
      const {Browser} = require('puppeteer');
      const browser = new Browser({headless: false});
      
      browser.newPage().then(async page => {
        try {
          page.on('load', () => console.log('LOADED: ' + page.url()));
          await page.navigate('https://google.com');
          await page.waitFor('input[name=q]');
          await page.focus('input[name=q]');
          await page.type('blin');
          await page.press('Enter');
          for (let i = 0; i < 10; ++i) {
            const searchResult = `div.g:nth-child(${i + 1}) h3 a`;
            await page.waitFor(searchResult, {visible: true});
            // Register the navigation wait first and await the click together
            // with it, so a failed click rejects here instead of becoming an
            // unhandled rejection while the navigation wait times out.
            await Promise.all([
              page.waitForNavigation(),
              page.click(searchResult),
            ]);
            await page.screenshot({path: `screenshot-${i + 1}.png`});
            await page.goBack();
          }
        } finally {
          // Close the browser even when one of the steps above throws.
          await browser.close();
        }
      }).catch(err => {
        // Report the failure and signal it through the process exit code.
        console.error(err);
        process.exitCode = 1;
      });
      
    • Puppeteer script to extract URLs for a given search query
      // Runs one Google search and prints the href of every result link,
      // one per line.
      // NOTE(review): `puppeteer` is not imported in this snippet — presumably
      // `require('puppeteer')` precedes it; confirm.
      const browser = await puppeteer.launch();
      try {
        const page = await browser.newPage();
        await page.goto('https://google.com', {waitUntil: 'networkidle2'});
      
        // Type our query into the search bar once it exists
        await page.waitForSelector('input[name=q]');
        await page.type('input[name=q]', 'epub reader windows');
      
        await page.click('input[type="submit"]');
      
        // Wait for the results to show up
        // NOTE(review): this waits for `h3 a` but the extraction below only
        // collects `h3.r a`; if result headings lack class `r` the list is
        // empty — confirm against the current page markup.
        await page.waitForSelector('h3 a');
      
        // Extract the results from the page
        const links = await page.evaluate(() => {
          const anchors = Array.from(document.querySelectorAll('h3.r a'));
          return anchors.map(anchor => anchor.href);
        });
        console.log(links.join('\n'));
      } finally {
        // Always shut the browser down, even when a step above throws.
        await browser.close();
      }
      
    • Puppeteer script to extract the first 100 search results for a query
      // Launch a browser with default options and open a single page.
      // `page` is the one extractLinks() navigates; `browser` is closed at the
      // end of the script.
      // NOTE(review): `puppeteer` is not imported in this snippet — presumably
      // `require('puppeteer')` precedes it; confirm.
      const browser = await puppeteer.launch();
      const page = await browser.newPage();
      
      /**
       * Builds a Google search URL for `term`.
       *
       * Example: buildQuery('epub reader', 1)
       *   -> https://google.com/search?q=epub+reader&start=10
       *
       * @param {string} term - Search phrase; it is URI-encoded and every
       *   encoded space (%20) is turned into `+`.
       * @param {number} [page=0] - Zero-based results page; pages above 0 add
       *   `&start=<page * 10>`.
       * @returns {string} The search URL.
       */
      function buildQuery(term, page=0) {
        // A string pattern would replace only the first %20, so use a global
        // regex to cover queries with more than two words.
        const q = encodeURIComponent(term).replace(/%20/g, '+');
        let url = `https://google.com/search?q=${q}`;
        if (page > 0) {
          url += `&start=${page * 10}`;
        }
        return url;
      }
      
      /**
       * Navigates the shared `page` to `url` and collects the result links.
       *
       * @param {string} url - Search results URL to open; it is also logged.
       * @returns {Promise<string[]>} The `href` of every `h3.r a` element.
       */
      async function extractLinks(url) {
          console.log(`url: ${url}`);
          await page.goto(url, {waitUntil: 'networkidle2'});
          // Block until at least one result heading link is present.
          // NOTE(review): the wait uses `h3 a` while the query below uses
          // `h3.r a` — confirm both match the current markup.
          await page.waitForSelector('h3 a');
          // Runs inside the page: map each matching anchor to its href.
          return page.evaluate(() =>
              Array.from(document.querySelectorAll('h3.r a'), (node) => node.href)
          );
      }
      
      /**
       * Prints the result links for `term`, one results page at a time.
       *
       * Each results page is fetched with extractLinks() and its links are
       * logged, newline-separated, before the next page is requested.
       *
       * @param {string} term - Search phrase passed to buildQuery().
       * @param {number} [maxPages=1] - Number of results pages to fetch
       *   (the original hard-coded 1, with a note suggesting 10).
       * @returns {Promise<void>}
       */
      async function extractFirst100(term, maxPages = 1) {
          for (let pageNo = 0; pageNo < maxPages; pageNo++) {
              const url = buildQuery(term, pageNo);
              // Declared locally: assigning to an undeclared `links` throws in
              // strict mode / ES modules and creates a global otherwise.
              const links = await extractLinks(url);
              console.log(links.join('\n'));
          }
      }
      
      /**
       * Fetches the result links for `term` from several results pages, then
       * prints them page by page once every page has been fetched.
       *
       * extractLinks() drives the single shared `page`, so the requests are
       * issued one at a time; starting them together would have every call
       * navigating the same page at once. Fetching in parallel would need a
       * separate page per request.
       *
       * @param {string} term - Search phrase passed to buildQuery().
       * @param {number} [maxPages=3] - Number of results pages to fetch
       *   (the original hard-coded 3, with a note suggesting 10).
       * @returns {Promise<void>}
       */
      async function extractFirst100v2(term, maxPages = 3) {
          const res = [];
          for (let pageNo = 0; pageNo < maxPages; pageNo++) {
              const url = buildQuery(term, pageNo);
              res.push(await extractLinks(url));
          }
          for (const links of res) {
              console.log(links.join('\n'));
          }
      }
      
      // Run the extraction, and close the browser even if it throws so no
      // browser process is left behind.
      try {
        await extractFirst100("epub reader");
      } finally {
        await browser.close();
      }
      

Feedback about page:

Feedback:
Optional: your email if you want me to get back to you:

Share on        

Need fast, offline access to 190+ programmer API docs? Try my app Documentalist for Windows