Scraping: How to get data from interactive plot created with HighCharts
On the page you can see a Highcharts chart titled "Total Coronavirus Deaths".
I tried to get data which it uses to display this chart.
It doesn't use AJAX to load the data from another URL, so I couldn't read it directly. It also doesn't keep the data in a separate JavaScript variable or in an HTML tag.
It has all data directly in HTML in JavaScript in Highcharts.chart(....)
so I tried to get these values using different methods.
Most of them need manual work to find the elements in the data and to build the correct indexes or XPath to get the values, so it is not that easy.
The nicest was to use js2xml
which parses JavaScript code and gives XML, on which you can use XPath
to search for items.
The hardest was to use pyjsparser
which also parses JavaScript code, but it gives a Python dictionary, which has no methods for searching items.
I also used plain string operations (split(), slicing [start:end]) to get the JSON data and convert it to Python using eval() or the json module (or, if the data is not in pure JSON format, dirtyjson).
import requests
from bs4 import BeautifulSoup
import json
#import dirtyjson
import js2xml
import pyjsparser
# First version of the script: download the page, pick the <script> tag that
# holds the `Highcharts.chart(...)` call and extract the series values and the
# two titles with four different techniques, printing each result.

import ast  # only needed for the safe literal parsing in the "eval" section

# NOTE: the URL is blank in this copy of the script; set it to the address of
# the page that embeds the chart before running (requests rejects an empty URL).
url = ''
r = requests.get(url)
soup = BeautifulSoup(r.text, "html.parser")
all_scripts = soup.find_all('script')

# Hard-coded position of the <script> with the chart; it stops working as soon
# as the page adds or removes a <script> tag before it.
script = all_scripts[24].text

print('\n--- eval ---\n')

# take the text between the first `data: [` and the following `]`
data = script.split('data: [', 1)[1].split(']', 1)[0]
# SECURITY: this text comes from a remote page, so it must not be passed to
# eval(), which would execute whatever the page contains.
# ast.literal_eval() accepts only literals and still gives a tuple here.
data = tuple(ast.literal_eval('[' + data + ']'))  # it creates tuple
print(data)

# text values
# maxsplit=1 + [-1] -> text after the first `title`, i.e. the chart title
data = script.split("title: { text: '", 1)[-1].split("'", 1)[0]
print(data)
# maxsplit=3 + [-1] -> text after the last `title`, i.e. the yAxis title
data = script.split("title: { text: '", 3)[-1].split("'", 1)[0]
print(data)

print('\n--- json ---\n')

data = script.split('data: [', 1)[1].split(']', 1)[0]
data = '[' + data + ']'  # create correct JSON data
data = json.loads(data)  # this time doesn't need `dirtyjson`
print(data)

# text values
data = script.split("title: { text: '", 1)[-1].split("'", 1)[0]
print(data)
data = script.split("title: { text: '", 3)[-1].split("'", 1)[0]
print(data)

print('\n--- js2xml ---\n')

data = js2xml.parse(script)
print(data.xpath('//property[@name="data"]//number/@value'))  # nice and short xpath

# text values
text = data.xpath('//property[@name="title"]//string/text()')
print(text[0])
print(text[1])

print('\n--- pyjsparser ---\n')

data = pyjsparser.parse(script)
# a lot of work to find it: second argument of the call -> second-to-last
# property (`series`) -> first series -> last property (`data`) -> elements
data = data['body'][0]['expression']['arguments'][1]['properties'][-2]['value']['elements'][0]['properties'][-1]['value']['elements']
#print(json.dumps(data, indent=2))
data = [x['value'] for x in data]
print(data)

# text values
# it needs work
Highcharts.chart('coronavirus-deaths-linear', { chart: { type: 'line' }, title: { text: 'Total Deaths' }, subtitle: { text: '(Linear Scale)' }, xAxis: { categories: ["Jan 22","Jan 23","Jan 24","Jan 25","Jan 26","Jan 27","Jan 28","Jan 29","Jan 30","Jan 31","Feb 01","Feb 02","Feb 03","Feb 04","Feb 05","Feb 06","Feb 07","Feb 08","Feb 09","Feb 10","Feb 11","Feb 12","Feb 13","Feb 14","Feb 15","Feb 16","Feb 17","Feb 18","Feb 19","Feb 20","Feb 21","Feb 22","Feb 23","Feb 24","Feb 25","Feb 26","Feb 27","Feb 28","Feb 29","Mar 01","Mar 02","Mar 03","Mar 04","Mar 05"] }, yAxis: { title: { text: 'Total Coronavirus Deaths' } }, legend: { layout: 'vertical', align: 'right', verticalAlign: 'middle' }, credits: { enabled: false }, series: [{ name: 'Deaths', color: '#FF9900', lineWidth: 5, data: [17,25,41,56,80,106,132,170,213,259,304,362,426,492,565,638,724,813,910,1018,1115,1261,1383,1526,1669,1775,1873,2009,2126,2247,2360,2460,2618,2699,2763,2800,2858,2923,2977,3050,3117,3202,3285,3387] }], responsive: { rules: [{ condition: { maxWidth: 800 }, chartOptions: { legend: { layout: 'horizontal', align: 'center', verticalAlign: 'bottom' } } }] } });
--- eval ---
(17, 25, 41, 56, 80, 106, 132, 170, 213, 259, 304, 362, 426, 492, 565, 638, 724, 813, 910, 1018, 1115, 1261, 1383, 1526, 1669, 1775, 1873, 2009, 2126, 2247, 2360, 2460, 2618, 2699, 2763, 2800, 2858, 2923, 2977, 3050, 3117, 3202, 3285, 3387)
Total Deaths
Total Coronavirus Deaths
--- json ---
[17, 25, 41, 56, 80, 106, 132, 170, 213, 259, 304, 362, 426, 492, 565, 638, 724, 813, 910, 1018, 1115, 1261, 1383, 1526, 1669, 1775, 1873, 2009, 2126, 2247, 2360, 2460, 2618, 2699, 2763, 2800, 2858, 2923, 2977, 3050, 3117, 3202, 3285, 3387]
Total Deaths
Total Coronavirus Deaths
--- js2xml ---
['17', '25', '41', '56', '80', '106', '132', '170', '213', '259', '304', '362', '426', '492', '565', '638', '724', '813', '910', '1018', '1115', '1261', '1383', '1526', '1669', '1775', '1873', '2009', '2126', '2247', '2360', '2460', '2618', '2699', '2763', '2800', '2858', '2923', '2977', '3050', '3117', '3202', '3285', '3387']
Total Deaths
Total Coronavirus Deaths
--- pyjsparser ---
[17.0, 25.0, 41.0, 56.0, 80.0, 106.0, 132.0, 170.0, 213.0, 259.0, 304.0, 362.0, 426.0, 492.0, 565.0, 638.0, 724.0, 813.0, 910.0, 1018.0, 1115.0, 1261.0, 1383.0, 1526.0, 1669.0, 1775.0, 1873.0, 2009.0, 2126.0, 2247.0, 2360.0, 2460.0, 2618.0, 2699.0, 2763.0, 2800.0, 2858.0, 2923.0, 2977.0, 3050.0, 3117.0, 3202.0, 3285.0, 3387.0]
The same data are also as table on
Similar data for coronavirus
you can find on GitHub:
EDIT: 2020.05.06
The JavaScript on the page has a new structure, so I changed the code.
import requests
from bs4 import BeautifulSoup
import json
#import dirtyjson
import js2xml
import pyjsparser
# --- functions ---
def test_eval(script):
print('\n--- eval ---\n')
# chart values
text = script.split('data: [', 1)[1] # beginning
text = text.split(']', 1)[0] # end
values = eval(text) # it creates tuple
# title
# I split `yAxis` because there is other `title` without text
# I split beginning in few steps because text may have different indentations (different number of spaces)
# (you could use regex to split in one step)
text = script.split("title: {\n", 1)[1] # beginning
text = text.split("text: '", 1)[1] # beginning
title = text.split("'", 1)[0] # end
print('\ntitle:', title)
text = script.split("yAxis: {\n", 1)[1] # beginning
text = text.split("title: {\n", 1)[1] # beginning
text = text.split("text: '", 1)[1] # beginning
title = text.split("'", 1)[0] # end
print('\ntitle:', title)
def test_json(script):
print('\n--- json ---\n')
# chart values
text = script.split('data: [', 1)[1] # beginning
text = text.split(']', 1)[0] # end
text = '[' + text + ']' # create correct JSON data
values = json.loads(text) # this time doesn't need `dirtyjson`
# title
# I split `yAxis` because there is other `title` without text
# I split beginning in few steps because text may have different indentations (different number of spaces)
# (you could use regex to split in one step)
text = script.split("title: {\n", 1)[1] # beginning
text = text.split("text: '", 1)[1] # beginning
title = text.split("'", 1)[0] # end
print('\ntitle:', title)
text = script.split("yAxis: {\n", 1)[1] # beginning
text = text.split("title: {\n", 1)[1] # beginning
text = text.split("text: '", 1)[1] # beginning
title = text.split("'", 1)[0] # end
print('\ntitle:', title)
def test_js2xml(script):
    """Print the chart values and the two titles found in *script* via js2xml.

    *script* is parsed with ``js2xml.parse`` and the results are selected
    with XPath queries on the returned tree.

    Args:
        script: JavaScript source text containing the ``Highcharts.chart``
            call.

    Raises:
        IndexError: if a title query returns no match.
    """
    print('\n--- js2xml ---\n')

    data = js2xml.parse(script)

    # chart values (short and nice path)
    values = data.xpath('//property[@name="data"]//number/@value')
    # The XPath result holds strings; convert with
    # `[int(x) for x in values]` or `[float(x) for x in values]` if needed.
    print(values)  # previously selected and then dropped without being shown

    # title (short and nice path)
    title = data.xpath('//property[@name="title"]//string/text()')
    title = title[0]  # first match in document order
    print('\ntitle:', title)

    # restrict the search to `yAxis` to get the axis title
    title = data.xpath('//property[@name="yAxis"]//property[@name="title"]//string/text()')
    title = title[0]
    print('\ntitle:', title)
def test_pyjsparser(script):
    """Print the chart values found in *script* via pyjsparser.

    *script* is parsed with ``pyjsparser.parse`` and the resulting nested
    dictionaries are walked by hand: every top-level expression statement
    that calls something on ``Highcharts`` with more than one argument is
    searched for a ``series`` property, and the values of the last property
    of its first element are printed.

    Args:
        script: JavaScript source text containing the ``Highcharts.chart``
            call.
    """
    print('\n--- pyjsparser ---\n')

    data = pyjsparser.parse(script)

    print("body's number:", len(data['body']))

    for body in data['body']:
        if body.get('type') != 'ExpressionStatement':
            continue

        # Not every expression statement is a call on an object (for example
        # an assignment or a plain `func()` call), so each level is read with
        # .get() instead of raising KeyError on the first unrelated statement.
        expression = body.get('expression') or {}
        callee = expression.get('callee') or {}
        owner = callee.get('object') or {}
        arguments = expression.get('arguments') or []

        if owner.get('name') != 'Highcharts' or len(arguments) <= 1:
            continue

        #print(json.dumps(arguments, indent=2))

        # arguments[0] is the container id, arguments[1] the options object
        for prop in arguments[1].get('properties', []):
            # quoted keys are parsed as literals and have no `name`
            if prop['key'].get('name') != 'series':
                continue

            values = prop['value']['elements'][0]  # first series
            values = values['properties'][-1]  # assumes `data` is its last property
            values = values['value']['elements']  # a lot of work to find it
            #print(json.dumps(values, indent=2))

            values = [x['value'] for x in values]
            print(values)  # previously collected and then dropped without being shown

    # title (very complicated path)
    # It needs more work to find correct indexes to get title
    # so I skip this part as too complex.
# --- main ---
# Download the page, find every <script> tag that contains a
# `Highcharts.chart(...)` call and run all four extraction methods on it.

# NOTE: the URL is blank in this copy of the script; set it to the address of
# the page that embeds the chart before running (requests rejects an empty URL).
url = ''
r = requests.get(url)
soup = BeautifulSoup(r.text, "html.parser")
all_scripts = soup.find_all('script')
print('number of scripts:', len(all_scripts))

for number, script in enumerate(all_scripts):
    #if 'data: [' in script.text:
    if 'Highcharts.chart' in script.text:
        print('\n=== script:', number, '===\n')
        # The functions above were defined but never called, so the loop only
        # printed the header; run each method on the script that was found.
        test_eval(script.text)
        test_json(script.text)
        test_js2xml(script.text)
        test_pyjsparser(script.text)
Buy a Coffee