(2) First, launch the desktop version of the Chrome browser and go to this website: "http://aastocks.com/en/stocks/market/calendar.aspx?type=5"
(3) Install the "JQuery Inject" Chrome extension and enable it in the current browser session.
(4) Open Chrome Developer Tools (Ctrl-Shift-I) and select the Console tab. Enter the following code there, in 3 steps.
- Chrome Browser console code : Step 1 Select all
// Step 1
// Define the array that collects the scraped objects. It is declared with
// `var` at the top level so that it is stored as a global variable and stays
// available to the later steps.
var scrapeResults = [];
- Console Code : Step 2 Select all
//Step 2
// Scrape every row of the calendar table into the global `scrapeResults`
// array (defined in Step 1), then copy the collected results to the clipboard.
// The scraping state lives in a block scope so that re-running this step does
// not leak or re-declare helper variables in the console's global scope.
{
  // Text of the most recent non-empty date cell. Rows whose date cell is blank
  // reuse the date of the nearest preceding row that had one.
  let previousDate = '';

  $("table.CalendarResultTable > tbody > .crtRow").each((index, element) => {
    const tds = $(element).find("td");

    // Cell 0: date (left blank on rows that share the previous row's date).
    const dateText = $(tds[0]).text();
    if (index === 0 || dateText.trim() !== '') {
      previousDate = dateText;
    }
    const date = previousDate;

    // Cell 1: company name followed by a link whose text is the stock code.
    const namecell = $(tds[1]).find("a");
    // Keep only the text before the "<digits>.HK" code; the dot is escaped so
    // that it matches a literal "." instead of any character.
    const name = $(tds[1]).text().replace('\n', '').split(/[0-9]+\.HK/)[0].trim();
    const stockcode = $(namecell).text();
    const stockurl = $(namecell).attr("href");

    // Cell 2: the text after the "D:" marker (undefined when the marker is absent).
    const dividend = $(tds[2]).text().trim().split('D:')[1];

    // Cell 3: "Ex-Date: … Payable: … Book Close: …" split on its labels;
    // element 0 is whatever precedes the first label and is not used.
    const dividenddate = $(tds[3]).text().trim().split(/Ex-Date: | Payable: | Book Close: /);
    const exdate = dividenddate[1];
    const payable = dividenddate[2];
    const bookclose = dividenddate[3];

    const scrapeResult = { date, name, stockcode, stockurl, dividend, exdate, payable, bookclose };
    //console.log(scrapeResult);

    // Store only the first entry seen for each stock code.
    const alreadyStored = scrapeResults.some((stored) => stored.stockcode === scrapeResult.stockcode);
    if (!alreadyStored) {
      scrapeResults.push(scrapeResult);
    }
  });
}
// copy to clipboard.
copy(scrapeResults);
- Console Code : Step 3 Select all
// Step 3
// define download function for webAPI
/**
 * Save in-memory content as a file by clicking a temporary download link.
 *
 * @param {*} content - Data placed in the Blob (here, a JSON string).
 * @param {string} fileName - File name assigned to the link's `download` attribute.
 * @param {string} contentType - MIME type given to the Blob, e.g. 'text/plain'.
 * @returns {void}
 */
function download(content, fileName, contentType) {
  const anchor = document.createElement("a");
  const file = new Blob([content], { type: contentType });
  const objectUrl = URL.createObjectURL(file);
  anchor.href = objectUrl;
  anchor.download = fileName;
  anchor.click();
  // Release the object URL so the Blob can be garbage-collected. The release
  // is deferred so the browser can start the download before the URL is
  // invalidated.
  setTimeout(() => URL.revokeObjectURL(objectUrl), 1000);
}
// Serialize the scraped results and save them to the local download folder.
// The file name is the local date and time (year, then two-digit month, day,
// "_", two-digit hours, minutes, seconds) followed by "_stockjson.txt".
var jsonData = JSON.stringify(scrapeResults);
var currentdate = new Date();
{
  const twoDigits = (value) => String(value).padStart(2, '0');
  const monthAndDay = [currentdate.getMonth() + 1, currentdate.getDate()]
    .map((part) => twoDigits(part))
    .join('');
  const timeOfDay = [currentdate.getHours(), currentdate.getMinutes(), currentdate.getSeconds()]
    .map((part) => twoDigits(part))
    .join('');
  download(
    jsonData,
    `${currentdate.getFullYear()}${monthAndDay}_${timeOfDay}_stockjson.txt`,
    'text/plain'
  );
}
(5) The same jQuery code above can be reused in a Node.js script for automation. Just add the "request request-promise cheerio" packages to the project.
(6) For nodejs, the save function should be
- nodejs script Select all
// Write the serialized results (`jsonData`) to "json.txt"; a write failure is
// reported on the console.
var fs = require('fs');
fs.writeFile("json.txt", jsonData, (err) => {
  if (!err) {
    return;
  }
  console.log(err);
});
(7) If jQuery Inject is not available in the browser console, or you do not want to import the jQuery library, use the querySelectorAll() function instead and use Object.values to convert its result to an array, as in the demo below.
- console code Select all
// Go to http://www.aastocks.com/en/stocks/market/calendar.aspx?type=1, open the
// browser (Chrome, Firefox, Safari) developer tools (Cmd + Opt + I on Mac or
// Ctrl + Shift + I on Windows) and enter the following console code to run.
//
// Builds one line of text per table row: the trimmed text of each cell that is
// present, separated by spaces, with a newline after the last cell.
var scrapeResults = '';
document.querySelectorAll('tr.crtRow').forEach(function (item) {
  // Each entry pairs a cell selector with the separator appended after its text.
  [
    ['.first', ' '],
    ['td.second', ' '],
    ['td.minw4', ' '],
    ['td.last.minw1', '\n'],
  ].forEach(function ([selector, separator]) {
    // Object.values turns the NodeList into an array; take its first match.
    const [cell] = Object.values(item.querySelectorAll(selector));
    if (cell === undefined || cell === null) {
      return; // this row has no such cell
    }
    // Trim unconditionally: `textContent ?? textContent.trim()` never trimmed,
    // because `??` only falls through to its right side for null/undefined.
    const text = cell.textContent.trim();
    console.log(text);
    scrapeResults = scrapeResults.concat(text, separator);
  });
});
// copy results to clipboard.
copy(scrapeResults);
(7.1) Alternatively, the browser console code can use Array.from and the map function to return the rows as JSON data.
- console code Select all
//
// Go to http://www.aastocks.com/en/stocks/market/calendar.aspx?type=1, open the
// browser (Chrome, Firefox, Safari) developer tools (Cmd + Opt + I on Mac or
// Ctrl + Shift + I on Windows) and enter the following console code to run.
//
// Converts every `.crtRow` table row into a { date, stock, code, industry,
// period } object, logs each row, and copies the resulting array to the clipboard.
var data = Array.from(
  document.querySelectorAll('.crtRow')
).map(
  // Trimmed text of every cell in the row.
  (tr) => Array.from(tr.children).map((node) => node.textContent.trim())
).map(
  // A blank first cell means the row shares the date of an earlier row;
  // drop that blank cell so such rows have 3 cells instead of 4.
  (row) => (row[0].length === 0 ? row.slice(1) : row)
).map((row, idx, rows) => {
  if (row.length === 1) return null; // single-cell rows carry no record

  // Walk back (starting at this row) to the nearest 4-cell row, which holds
  // the date. The loop stops at index 0 instead of reading past the start.
  let datedRow;
  for (let i = idx; i >= 0; i -= 1) {
    if (rows[i].length === 4) {
      datedRow = rows[i];
      break;
    }
  }
  const date = datedRow ? datedRow[0] : '';

  // 3-cell rows have no date cell, so every column sits one index lower.
  const shift = row.length === 3 ? 1 : 0;
  const stock = row[1 - shift];
  const industry = row[2 - shift];
  // Same column that the log line below prints; the former extra "- 3"
  // made this field read the date cell (or undefined) instead.
  const period = row[3 - shift];

  console.log(''.concat(date, ' ', stock, ' ', industry, ' ', period, '\n'));
  return {
    date,
    stock,
    // Last 8 characters of the stock text, then their first 5 characters.
    code: stock.slice(-8).slice(0, 5),
    industry,
    period,
  };
}).filter(Boolean);
console.log(data);
copy(data);
(8) Another example of Node.js scraping code is shown in the demo below.
- ronaldo.js Select all
// Install the dependencies first: npm install request request-promise jsdom
// Run with: node ronaldo.js
const request = require("request-promise");
const jsdom = require("jsdom");
const { JSDOM } = jsdom;

const url = "https://www.transfermarkt.com/cristiano-ronaldo/alletore/spieler/8198/plus/1";
// Declared with const so it is no longer an implicit global. A browser-like
// User-Agent header is sent with the request.
const headers = { 'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36' };
// Options passed to request.get(); `timeout` is in milliseconds (5 minutes).
const options = {
  url: url,
  timeout: 300000,
  headers: headers,
};
/**
 * Convert table rows (arrays of trimmed cell text) into goal records.
 *
 * A 15-cell row has its cell at index 5 dropped so it becomes a 14-cell row.
 * A 14-cell row describes a match together with its first goal; shorter rows
 * describe further goals of the most recent 14-cell row. Single-cell rows are
 * skipped.
 *
 * @param {string[][]} rows - Cell texts, one array per table row.
 * @returns {object[]} One record per goal, in row order.
 */
function toGoalRecords(rows) {
  const records = [];
  let match = null; // most recent 14-cell (full match) row
  for (const cells of rows) {
    const row = cells.length === 15 ? [...cells.slice(0, 5), ...cells.slice(6)] : cells;
    if (row.length === 1) continue;

    const isSameMatch = row.length === 14;
    if (isSameMatch) match = row;
    // A goal row that appears before any full match row has no match details
    // to attach to; skip it rather than reading before the start of the
    // list, which would throw and abort the whole scrape.
    if (match === null) continue;

    // In a full row the goal columns sit 9 cells further right.
    const offset = isSameMatch ? 9 : 0;
    records.push({
      competition: match[1],
      matchday: match[2],
      date: match[3],
      venue: match[4],
      opponent: match[7],
      result: match[8],
      position: match[9],
      minute: row[1 + offset],
      atScore: row[2 + offset],
      goalType: row[3 + offset],
      assist: row[4 + offset],
    });
  }
  return records;
}

/**
 * Download the page described by `options`, extract the goal table and log
 * the goals whose date falls in `minYear` or later.
 *
 * @param {number} [minYear=2021] - Earliest year (from `new Date(date)`) to keep.
 * @returns {Promise<object[]|undefined>} The logged records, or undefined when
 *   an error occurred (the error is logged with console.error).
 */
async function scrape(minYear = 2021) {
  try {
    const htmlResult = await request.get(options);
    const dom = new JSDOM(htmlResult);
    const { document } = dom.window;
    const rows = Array.from(
      document.querySelectorAll('.responsive-table table tbody tr'),
      (tr) => Array.from(tr.children, (node) => node.textContent.trim())
    );
    const data = toGoalRecords(rows)
      .filter((goal) => new Date(goal.date).getFullYear() >= minYear); // filter year
    console.log(data);
    return data;
  } catch (err) {
    console.error(err);
    return undefined;
  }
}
// Start the scrape. Errors are caught and logged inside scrape() itself.
scrape();
No comments:
Post a Comment