Creating a Puppeteer Web Crawler

Overview

I built a crawler bot that collects email addresses from web pages using the Puppeteer and Cheerio modules.
Starting from the trigger (seed) page, it collects email addresses and follows the <a href=...> links it finds, with the intent of never visiting the same URL twice.

Module Documentation

👉 Puppeteer Documentation
👉 Cheerio

Source Code

// crawl.js
// 3rd party declaration
import * as puppeteer from 'puppeteer';
import * as cheerio from 'cheerio';
// own libraries declaration
import Queue from './queue.js';
 
/**
 * Concept:
 *  1. dequeue url from visitQueue
 *  2. visit url
 *  3. extract email address list
 *  4. extract href, enqueue to visitQueue
 *  5. repeat 1 ~ 4
 */
// Crawler entry point: breadth-first walk over pages reachable from the seed
// URL, accumulating every email address found along the way.
(async () => {
    const emails = new Set();
    // URLs that have already been queued; consulted before every enqueue so
    // that no URL is visited more than once.
    const histories = new Set();
    const visitQueue = new Queue();

    const browser = await puppeteer.launch();
    try {
        const workPage = await browser.newPage();

        // Trigger Page setup
        const triggerUrl = "https://www.naver.com";
        histories.add(triggerUrl);
        visitQueue.enqueue(triggerUrl);

        while (!visitQueue.isEmpty()) {
            // 1.
            const url = visitQueue.dequeue();
            // 2.
            try {
                await workPage.goto(url);
            } catch (err) {
                // One unreachable or slow page must not abort the whole crawl.
                console.error(`Failed to load ${url}:`, err);
                continue;
            }
            // 3.
            const $ = cheerio.load(await workPage.content());
            for (const email of extractEmails($)) {
                emails.add(email);
            }
            // 4.
            for (const href of validateHrefs(extractHrefs($), histories)) {
                // The same link can appear several times on one page, so
                // re-check the history as it grows within this loop.
                if (histories.has(href)) {
                    continue;
                }
                histories.add(href);
                visitQueue.enqueue(href);
            }
            // logging
            console.log(emails);
        }
    } finally {
        // Always release the browser process, even when the crawl fails.
        await browser.close();
    }
})().catch((err) => {
    console.error(err);
    process.exitCode = 1;
});
 
/**
 * Collects every email-like substring from the text of the page's <body>.
 *
 * The pattern follows the commonly used RFC 2822 style email expression.
 * It is applied case-insensitively so that addresses containing uppercase
 * letters (e.g. "John.Doe@Example.com") are matched in full instead of
 * being skipped or truncated.
 *
 * @param {Function} $ - Cheerio-style selector function for the loaded page.
 * @returns {string[]} All matches in document order (duplicates included),
 *                     or an empty array when nothing matches.
 */
function extractEmails($) {
    const emailPattern = /[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?/gi;
    const bodyText = $('body').text();

    // String.prototype.match returns null (not []) when there is no match.
    return bodyText.match(emailPattern) || [];
}
 
/**
 * Collects the link targets of every <a> element on the page.
 *
 * Absolute http(s) links are returned as written (after trimming surrounding
 * whitespace). When `baseUrl` is supplied, relative links such as "/path" or
 * "./path" are resolved against it; without `baseUrl` they are discarded.
 * Links that are empty, fragment-only ("#top"), or resolve to a non-http(s)
 * scheme (mailto:, javascript:, ...) are always discarded.
 *
 * @param {Function} $ - Cheerio-style selector function for the loaded page.
 * @param {string} [baseUrl] - URL of the page, used to resolve relative links.
 * @returns {string[]} Link URLs in document order (duplicates included).
 */
function extractHrefs($, baseUrl) {
    const hrefs = [];

    $('a').each((i, a) => {
        const href = ($(a).attr('href') || "").trim();

        if (href.startsWith("https://") || href.startsWith("http://")) {
            hrefs.push(href);
            return;
        }
        // Nothing to resolve, or a pure in-page anchor.
        if (baseUrl == null || href === "" || href.startsWith("#")) {
            return;
        }

        let resolved;
        try {
            resolved = new URL(href, baseUrl);
        } catch {
            // Malformed href or base: skipping the link is the intended outcome.
            return;
        }
        if (resolved.protocol === "https:" || resolved.protocol === "http:") {
            hrefs.push(resolved.href);
        }
    });

    return hrefs;
}
 
/**
 * Keeps only the links that are not yet recorded in the visit history.
 *
 * @param {string[]} hrefs - Candidate link URLs.
 * @param {Set<string>} histories - URLs that must be excluded.
 * @returns {string[]} A new array with the candidates absent from `histories`.
 */
function validateHrefs(hrefs, histories) {
    const isUnvisited = (candidate) => histories.has(candidate) === false;

    return hrefs.filter(isUnvisited);
}
 
/**
 * Pauses the calling async flow for the given duration.
 *
 * @param {number} seconds - Delay length in seconds.
 * @returns {Promise<void>} Promise that settles once the timer has fired.
 */
async function wait(seconds) {
    const delayMs = seconds * 1000;

    await new Promise((done) => {
        setTimeout(done, delayMs);
    });
}

// queue.js
/**
 * Minimal first-in, first-out queue backed by a plain array (`arr`).
 */
export default class Queue {
    /** Backing storage; the oldest element sits at index 0. */
    arr = [];

    /**
     * Appends an element to the back of the queue.
     *
     * @param {*} element - Value to store.
     */
    enqueue(element) {
        const items = this.arr;
        items.push(element);
    }

    /**
     * Removes the element at the front of the queue.
     *
     * @returns {*} The removed element, or `undefined` when the queue is empty.
     */
    dequeue() {
        const front = this.arr.shift();
        return front;
    }

    /**
     * @returns {boolean} `true` when the queue holds no elements.
     */
    isEmpty() {
        return this.arr.length === 0;
    }
}

Improvement Directions

Duplicate checking should be performed after stripping query and path parameters from each URL, so that the same page is not visited repeatedly under different URLs. (Using the page's canonical tag might work?)
When an href contains a relative path such as '/path' or './path' instead of a full URL, it is currently discarded; if the base URL of the page can be obtained, the relative path could be resolved against it to form a full URL.