Saturday, May 1, 2021

How to use Chrome browser to scrape website using javascript

(1) This demo shows how to scrape a website using the Chrome browser and save the JSON results to your local drive.

(2) First launch Desktop version of Chrome Browser and goto this website "http://aastocks.com/en/stocks/market/calendar.aspx?type=5"

(3) Install "JQuery Inject" as Chrome Extension and enable it in current browser session.

(4) Open Chrome Developer tools (Ctrl-Shift-I) and select console tab to enter the following code. Enter the code in 3 steps.
Chrome Browser console code : Step 1   Select all
// Step 1
// Define the accumulator array for scraped objects.
// Declared with `var` on purpose: it must live as a global across
// repeated console runs (Step 2 is re-executed once per page).
var scrapeResults = [];

Console Code : Step 2   Select all
// Step 2
// Scrape every row of the dividend calendar table on the current page and
// push stocks not yet seen into the global scrapeResults array (Step 1).
// `var` (not let/const) so this snippet can be re-run for each page
// without a re-declaration error in the console. Previously previousDate
// was an implicit global — declare it explicitly.
var previousDate;
$("table.CalendarResultTable > tbody > .crtRow").each((index, element) => {
  const tds = $(element).find("td");
  // Rows sharing the previous row's date leave the date cell blank,
  // so only update previousDate when the cell actually has text.
  if (index === 0 || $(tds[0]).text().trim() !== '') {
    previousDate = $(tds[0]).text();
  }
  const date = previousDate;
  const namecell = $(tds[1]).find("a");
  // Cell text looks like "<name> <code>.HK ..."; keep only the name part.
  // Note: the dot is escaped — the original /[0-9]+.HK/ matched any char.
  const name = $(tds[1]).text().replace('\n', '').split(/[0-9]+\.HK/)[0].trim();
  const stockcode = $(namecell).text();
  const stockurl = $(namecell).attr("href");
  // Dividend cell reads "D: <amount>"; take the part after the label.
  const dividend = $(tds[2]).text().trim().split('D:')[1];
  const dividenddate = $(tds[3]).text().trim().split(/Ex-Date: | Payable: | Book Close: /);
  const exdate = dividenddate[1];
  const payable = dividenddate[2];
  const bookclose = dividenddate[3];
  const scrapeResult = { date, name, stockcode, stockurl, dividend, exdate, payable, bookclose };
  // Deduplicate by stock code across pages.
  if (!scrapeResults.find(({ stockcode }) => stockcode === scrapeResult.stockcode)) {
    scrapeResults.push(scrapeResult);
  }
});
// Copy the accumulated results to the clipboard (devtools helper).
copy(scrapeResults);
If there is more than one page, click to go to the next page and repeat Step 2 to scrape again. After finishing all the pages, run the Step 3 code to download the results to your local drive. scrapeResults must be stored as a global variable for this to work.

Console Code : Step 3   Select all
// Step 3
// Download the scraped JSON to the local drive as a timestamped .txt file
// by clicking a temporary <a download> element pointing at a Blob URL.
function download(content, fileName, contentType) {
  var a = document.createElement("a");
  var file = new Blob([content], { type: contentType });
  a.href = URL.createObjectURL(file);
  a.download = fileName;
  a.click();
  // Release the Blob URL once the click has been dispatched
  // (the original leaked one object URL per download).
  setTimeout(function () { URL.revokeObjectURL(a.href); }, 0);
}

// Zero-pad a date component to two digits (e.g. 7 -> "07").
function pad2(n) {
  return ('0' + n).slice(-2);
}

// Build a yyyymmdd_hhmmss timestamp so repeated downloads never collide.
var jsonData = JSON.stringify(scrapeResults);
var now = new Date();
var stamp = '' + now.getFullYear() + pad2(now.getMonth() + 1) + pad2(now.getDate()) +
  '_' + pad2(now.getHours()) + pad2(now.getMinutes()) + pad2(now.getSeconds());
download(jsonData, stamp + '_stockjson.txt', 'text/plain');


(5) The same jQuery code above can be used in a Node.js script for automation. Just add the "request request-promise cheerio" packages to the project.

(6) For nodejs, the save function should be
nodejs script   Select all
// Save the scraped JSON to json.txt in the current working directory.
// Errors are logged and otherwise ignored (best-effort save).
const fs = require('fs');

fs.writeFile("json.txt", jsonData, (err) => {
  if (err) {
    console.log(err);
  }
});


(7) For browser console code without the jQuery Inject extension (or if you don't want to import the jQuery library), you have to use the querySelectorAll() function and Object.values to convert the result to an object, as demonstrated below.
console code   Select all
// Go to http://www.aastocks.com/en/stocks/market/calendar.aspx?type=1, open
// the browser (Chrome, Firefox, Safari) developer tools (Cmd+Opt+I on Mac,
// Ctrl+Shift+I on Windows) and run this in the console.
// Scrapes the calendar rows without jQuery and builds one text line per row.
var scrapeResults = '';
document.querySelectorAll('tr.crtRow').forEach(function (row) {
  // Append one cell's trimmed text (when the cell exists) plus a separator.
  // Bug fix: the original `cell.textContent ?? cell.textContent.trim()`
  // always produced the UNtrimmed text, because textContent of an element
  // is a string and never nullish — the trim branch was dead code.
  function appendCell(selector, separator) {
    var cell = row.querySelector(selector);
    if (cell !== null) {
      var text = cell.textContent.trim();
      console.log(text);
      scrapeResults = scrapeResults.concat(text, separator);
    }
  }
  appendCell('.first', ' ');         // date
  appendCell('td.second', ' ');      // stock name / code
  appendCell('td.minw4', ' ');       // industry
  appendCell('td.last.minw1', '\n'); // period — ends the line
});
// Copy results to the clipboard (devtools helper).
copy(scrapeResults);


(7.1) For browser console code and use Array.from and map function to return json data.
console code   Select all
// Go to http://www.aastocks.com/en/stocks/market/calendar.aspx?type=1, open
// the browser developer tools (Cmd+Opt+I on Mac, Ctrl+Shift+I on Windows)
// and run this in the console. Uses Array.from + map to return JSON data.
var data = Array.from(document.querySelectorAll('.crtRow'))
  // Collect the trimmed text of every cell in each row.
  .map((row) => Array.from(row.children).map((node) => node.textContent.trim()))
  // Rows sharing the previous row's date start with an empty cell; drop it.
  .map((row) => (row[0].length === 0 ? [...row.slice(1)] : row))
  .map((row, idx, arr) => {
    if (row.length === 1) return null; // header/separator rows
    // Walk back to the nearest full 4-cell row to recover the date.
    const getLastMatch = (i, rows) =>
      rows[i].length === 4 ? rows[i] : getLastMatch(i - 1, rows);
    const match = getLastMatch(idx, arr);
    // A 3-cell row omitted its date cell, so every column shifts left by one.
    const shift = row.length === 3 ? 1 : 0;
    console.log(''.concat(match[0], ' ', row[1 - shift], ' ', row[2 - shift], ' ', row[3 - shift], '\n'));
    return {
      date: match[0],
      stock: row[1 - shift],
      // "Name 01234.HK" -> take "01234" out of the trailing code.
      code: row[1 - shift].slice(-8).slice(0, 5),
      industry: row[2 - shift],
      // Bug fix: the original indexed row[3 - shift - 3], i.e. row[0] or
      // row[-1] (the date or undefined) — the console.log above shows the
      // intended column is row[3 - shift].
      period: row[3 - shift],
    };
  })
  .filter(Boolean); // drop the null header rows
console.log(data);
copy(data);


(8) Another example of Node.js scraping code is shown below.
ronaldo.js   Select all
// ronaldo.js — scrape Cristiano Ronaldo's goal list from transfermarkt.com.
// Install dependencies first: npm install request request-promise jsdom
// Run with: node ronaldo.js
const request = require("request-promise");
const jsdom = require("jsdom");
const { JSDOM } = jsdom;

const url = "https://www.transfermarkt.com/cristiano-ronaldo/alletore/spieler/8198/plus/1";
// Send a browser User-Agent; the site rejects the default library UA.
// (`headers` was previously assigned without a declaration — implicit global.)
const headers = {
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36',
};
const options = { url: url, timeout: 300000, headers: headers };

async function scrape() {
  try {
    const htmlResult = await request.get(options);
    const dom = new JSDOM(htmlResult);
    const { document } = dom.window;
    const data = Array.from(document.querySelectorAll('.responsive-table table tbody tr'))
      // Trimmed text of every cell, per row.
      .map((row) => Array.from(row.children).map((node) => node.textContent.trim()))
      // A 15-cell row carries one extra cell at index 5; drop it to normalize to 14.
      .map((row) => (row.length === 15 ? [...row.slice(0, 5), ...row.slice(6)] : row))
      .map((row, idx, goals) => {
        if (row.length === 1) return null; // season/competition header rows
        // Walk back to the nearest full 14-cell row holding the match info.
        const getLastMatch = (i, rows) =>
          rows[i].length === 14 ? rows[i] : getLastMatch(i - 1, rows);
        const match = getLastMatch(idx, goals);
        // Extra goals in the same match appear as short rows; in a full
        // 14-cell row the goal columns sit 9 positions further right.
        const offset = row.length === 14 ? 9 : 0;
        return {
          competition: match[1],
          matchday: match[2],
          date: match[3],
          venue: match[4],
          opponent: match[7],
          result: match[8],
          position: match[9],
          minute: row[1 + offset],
          atScore: row[2 + offset],
          goalType: row[3 + offset],
          assist: row[4 + offset],
        };
      })
      .filter(Boolean) // drop header rows
      // NOTE(review): x.date is a non-ISO string from the site; Date parsing
      // of such strings is engine-dependent — confirm the format holds.
      .filter((x) => new Date(x.date).getFullYear() >= 2021); // filter by year
    console.log(data);
  } catch (err) {
    console.error(err);
  }
}

scrape();

No comments: