Chrome Extension - How to Scrape Data from a List of Pages

javascript
Published on December 8, 2020

This tutorial explains how to create a Google Chrome extension that will navigate through a list of given URLs, and save data from each page to a file.

The extension includes the following features :

  • Automate multiple URLs to open in the same tab — one after the other — from a given list.
  • Scraping data from each page to a JSON file.
  • Saving the files in the Downloads directory.

This tutorial assumes that the reader already knows how to create a basic Chrome extension. It covers only the JavaScript & HTML code involved in the extension files — manifest.json, background.js, popup.html, popup.js & the content script.

manifest.json

{
	"name": "UA Demo Extension",
	"version": "1.0",
	"description": "UA Demo Extension",
	"permissions": ["declarativeContent", "activeTab", "tabs", "downloads", "*://*/*"],
	"background": {
    	"scripts": ["background.js"],
      	"persistent": false
    },
    "page_action": {
    	"default_popup": "popup.html"
    },
	"manifest_version": 2
}

The following permissions are required :

  • declarativeContent - to load extension only when page url is of a specific hostname
  • activeTab - to access the active tab
  • tabs - to open URL in tab
  • downloads - to save files in Downloads directory
  • *://*/* - match pattern permission to access DOM content of the given URLs

background.js

'use strict';

// Show the extension's page action only on pages whose host is www.website.com.
chrome.runtime.onInstalled.addListener(() => {
	const pageChanged = chrome.declarativeContent.onPageChanged;

	// drop the rules registered earlier before adding the current rule
	pageChanged.removeRules(undefined, () => {
		const hostMatcher = new chrome.declarativeContent.PageStateMatcher({
			pageUrl: { hostEquals: 'www.website.com' },
		});
		const showAction = new chrome.declarativeContent.ShowPageAction();

		pageChanged.addRules([
			{ conditions: [hostMatcher], actions: [showAction] },
		]);
	});
});

For the sake of this tutorial, the extension is activated only when the hostname of the current page is www.website.com.

popup.html

<!DOCTYPE html>
<html>
<head>
<style>
/* small square orange button with a black border */
button {
    height: 30px;
    width: 30px;
    outline: none;
    background-color: orange;
    border: 2px solid #000000;
}
</style>
</head>

<body>

<!-- clicking this button starts the navigation; the click handler is set in popup.js -->
<button id="startNavigation"></button>
<script src="popup.js"></script>

</body>
</html>

The extension just consists of a button and a script file. On clicking this button, navigation will start.

Important: Navigation will stop if the popup loses focus, because the popup closes and popup.js stops running.

popup.js

'use strict';

// list of urls to navigate
const urls_list = [
	'https://website.com/page-1',
	'https://website.com/page-2',
	'https://website.com/page-3',
	'https://website.com/page-4',
	'https://website.com/page-5',
];

// start navigation when #startNavigation button is clicked
// (the button is looked up explicitly instead of relying on the implicit global created from its id)
document.getElementById('startNavigation').onclick = function() {
	// query the current tab to find its id
	chrome.tabs.query({active: true, currentWindow: true}, async function(tabs) {
		// nothing can be navigated when no active tab is reported
		if(!tabs || tabs.length === 0) {
			alert('Navigation Failed : no active tab found');
			return;
		}

		const tab_id = tabs[0].id;

		try {
			for(let i=0; i<urls_list.length; i++) {
				// navigate to next url (index passed to goToPage starts at 1)
				await goToPage(urls_list[i], i+1, tab_id);

				// wait for 5 seconds
				await waitSeconds(5);
			}
		}
		catch(error) {
			// report the failure instead of leaving an unhandled promise rejection
			alert('Navigation Failed : ' + error.message);
			return;
		}

		// navigation of all pages is finished
		alert('Navigation Completed');
	});
};

/**
 * Opens a url in the given tab, executes the content script once the page has
 * finished loading, and downloads the data sent back by it as a JSON file.
 *
 * @param {string} url - page to open
 * @param {number} url_index - index of the url, used in the download path content/<url_index>/data.json
 * @param {number} tab_id - id of the tab to navigate
 * @returns {Promise<void>} resolves after the content script has been executed
 *     (also when the script could not be executed, so that the remaining urls
 *     can still be visited); rejects when the tab could not be updated
 */
function goToPage(url, url_index, tab_id) {
	return new Promise(function(resolve, reject) {
		// fired when content script sends a message
		function getDOMInfo(message, sender) {
			// only accept the message sent from the tab being navigated
			if(!sender || !sender.tab || sender.tab.id !== tab_id)
				return;

			// remove onMessage event as it may get duplicated
			chrome.runtime.onMessage.removeListener(getDOMInfo);

			// save data from message to a JSON file and download
			let page_data = JSON.parse(message);
			let json_data = {
				title: page_data.title,
				h1: page_data.h1,
				url: url
			};

			let blob = new Blob([JSON.stringify(json_data)], {type: "application/json;charset=utf-8"});
			let objectURL = URL.createObjectURL(blob);
			chrome.downloads.download({ url: objectURL, filename: ('content/' + url_index + '/data.json'), conflictAction: 'overwrite' });
		}

		// fired when tab is updated
		function openPage(tabID, changeInfo) {
			// wait until the same tab has finished loading
			if(tabID !== tab_id || changeInfo.status !== 'complete')
				return;

			// remove tab onUpdate event as it may get duplicated
			chrome.tabs.onUpdated.removeListener(openPage);

			chrome.runtime.onMessage.addListener(getDOMInfo);

			// execute content script in the navigated tab
			chrome.tabs.executeScript(tab_id, { file: 'script.js' }, function() {
				if(chrome.runtime.lastError) {
					// no message will arrive; remove the listener so that it cannot
					// pick up the message of a page visited later
					chrome.runtime.onMessage.removeListener(getDOMInfo);
					console.warn('Could not execute script.js on ' + url + ' : ' + chrome.runtime.lastError.message);
				}

				// resolve Promise after content script has executed
				resolve();
			});
		}

		// listen before navigating so that no update event is missed
		chrome.tabs.onUpdated.addListener(openPage);

		// update the given tab with new url
		chrome.tabs.update(tab_id, {url: url}, function() {
			if(chrome.runtime.lastError) {
				// tab could not be updated, so the page will never finish loading
				chrome.tabs.onUpdated.removeListener(openPage);
				reject(new Error(chrome.runtime.lastError.message));
			}
		});
	});
}

// pauses for the given number of seconds; the returned Promise resolves without a value
async function waitSeconds(seconds) {
	const delay_ms = seconds * 1000;

	await new Promise((done) => {
		setTimeout(done, delay_ms);
	});
}

Content script script.js code :

// Wrapped in a function so that the variables stay local to one execution:
// top-level declarations would fail with an "already been declared" error
// if this script were executed more than once in the same page.
(function() {
	const page_title = document.title;

	// first h1 tag of the page, or null when the page has none
	const first_h1 = document.querySelector("h1");
	const page_h1_tag = (first_h1 !== null) ? first_h1.textContent : '';

	// prepare JSON data with page title & first h1 tag
	const data = JSON.stringify({ title: page_title, h1: page_h1_tag });

	// send message back to popup script
	chrome.runtime.sendMessage(null, data);
})();
  • async-await is used to write asynchronous code in a synchronous style.
  • Between each page, we are waiting for a few seconds. This is also written in async-await style.
  • The event chrome.tabs.onUpdated is fired when the tab is updated with the new url. We also wait for the page to be loaded.
  • We need to execute a content script within each page to get data from the page's DOM. The content script sends the data back as a message.
  • The chrome.runtime.onMessage is fired when the message is received from the content script.
  • We create a new Blob object using JSON data, and create a local object URL out of it. That object URL is given as the download url.
  • Scraped data is saved as content/1/data.json, content/2/data.json etc in the Downloads directory.

Other Resources

Comments
Loading Comments