Willow
0
Q:

web scraping with UrlFetchApp

/**
 * Fetch a page and extract common SEO fields with jQuery-style selectors.
 * Dependency: cheeriogs, see https://github.com/fgborges/cheeriogs
 *
 * The request is sent with `muteHttpExceptions: true` and `followRedirects: false`,
 * so a redirect or error response is reported through its status code instead of
 * being followed or thrown.
 *
 * @param {string} url Valid start URL; leading/trailing whitespace is stripped before fetching.
 * @return {Array} When the fetch completes, a single row
 *     `[[status, canonical, robots, title, description, h1, ogurl, ogtitle, ogimage, ogdescription]]`.
 *     Fields after `status` stay undefined unless the status is 200; attribute-based
 *     fields are also undefined when the attribute is missing or empty.
 *     If anything throws, a one-element array containing the error text.
 *
 * @customfunction
 */
function scraper(url) {
    var result = [];
    var title, ogurl, ogtitle, ogimage, ogdescription, description, h1, robots, canonical;
    var options = {
        'muteHttpExceptions': true,
        'followRedirects': false,
    };

    // Trimmed attribute value for the selector, or undefined when missing/empty.
    function readAttr($, selector, name) {
        var value = $(selector).attr(name);
        return value ? String(value).trim() : undefined;
    }

    try {
        // Keep the trimmed value: stray whitespace around the URL causes (rare) fetch errors.
        var target = url.toString().trim();
        var response = UrlFetchApp.fetch(target, options);
        var status = response.getResponseCode();
        // Only parse the body of a plain 200 response.
        if (status === 200) {
            var html = response.getContentText();
            var $ = Cheerio.load(html); // make sure this lib is added to your project!
            robots = readAttr($, 'meta[name="robots"]', 'content');
            canonical = readAttr($, "link[rel='canonical']", 'href');
            // A selection object is always truthy, so title and h1 are read
            // unconditionally; they come back as '' when the page has none.
            title = $('title').text().trim();
            description = readAttr($, 'meta[name=description]', 'content');
            h1 = $('h1').text().trim();
            // Open Graph tags
            ogurl = readAttr($, 'meta[property="og:url"]', 'content');
            ogtitle = readAttr($, 'meta[property="og:title"]', 'content');
            ogimage = readAttr($, 'meta[property="og:image"]', 'content');
            ogdescription = readAttr($, 'meta[property="og:description"]', 'content');
        }
        result.push([status, canonical, robots, title, description, h1, ogurl, ogtitle, ogimage, ogdescription]);
    } catch (error) {
        // Report the failure in the cell instead of raising a custom-function error.
        result.push(String(error));
    }
    // Returned after the try/catch rather than from `finally`, which would
    // override any exception escaping the handler above.
    return result;
}
0

New to Communities?

Join the community