Search on blog:

Python: How to scrape gpw.pl with spółki with requests + BS

It is example code to scrape it:

#!/usr/bin/env python3

# date: 2020.04.28
# https://stackoverflow.com/questions/61481586/how-to-scrap-the-non-loaded-content-of-the-page/

import requests
from bs4 import BeautifulSoup

def get_page(session):
    url = "https://www.gpw.pl/spolki"

    response = session.get(url)
    soup = BeautifulSoup(response.content)

    all_links = soup.find_all("a")

    data = []

    for link in all_links:
        href = link.get("href")
        if "spolka?isin=" in href:
            data.append(href)

    print('\n'.join(data))


def get_ajax_data(session, offset, limit=10):
    url = 'https://www.gpw.pl/ajaxindex.php'

    data = {
        "offset": str(offset),
        "limit": str(limit),
        "format": "html",

        "action": "GPWCompanySearch",
        "start": "ajaxSearch",
        "page": "spolki",
        "lang": "PL",
        "letter": "",
        "order": "",
        "order_type": "",
        "searchText": "",
        "index[empty]": "on",
        "index[WIG20]":"on",
        "index[mWIG40]":"on",
        "index[sWIG80]":"on",
        "index[WIG30]":"on",
        "index[WIG]":"on",
        "index[WIGdiv]":"on",
        "index[WIG-CEE]":"on",
        "index[WIG-Poland]":"on",
        "index[InvestorMS]":"on",
        "index[TBSP.Index]":"on",
        "index[CEEplus]":"on",
        "index[mWIG40TR]":"on",
        "index[NCIndex]":"on",
        "index[sWIG80TR]":"on",
        "index[WIG-banki]":"on",
        "index[WIG-budownictwo]":"on",
        "index[WIG-chemia]":"on",
        "index[WIG-energia]":"on",
        "index[WIG-ESG]":"on",
        "index[WIG-górnictwo]":"on",
        "index[WIG-informatyka]":"on",
        "index[WIG-leki]":"on",
        "index[WIG-media]":"on",
        "index[WIG-motoryzacja]":"on",
        "index[WIG-nieruchomości]":"on",
        "index[WIG-odzież]":"on",
        "index[WIG-paliwa]":"on",
        "index[WIG-spożywczy]":"on",
        "index[WIG-telekomunikacja]":"on",
        "index[WIG-Ukraine]":"on",
        "index[WIG.GAMES]":"on",
        "index[WIG.MS-BAS]":"on",
        "index[WIG.MS-FIN]":"on",
        "index[WIG.MS-PET]":"on",
        "index[WIG20TR]":"on",
        "index[WIG30TR]":"on",
        "index[WIGtech]":"on",
        "sector[510]":"510","sector[110]":"110","sector[750]":"750","sector[410]":"410","sector[310]":"310","sector[360]":"360","sector[740]":"740","sector[180]":"180","sector[220]":"220","sector[650]":"650","sector[350]":"350","sector[320]":"320","sector[610]":"610","sector[690]":"690","sector[660]":"660","sector[330]":"330","sector[820]":"820","sector[399]":"399","sector[150]":"150","sector[640]":"640","sector[540]":"540","sector[140]":"140","sector[830]":"830","sector[520]":"520","sector[210]":"210","sector[170]":"170","sector[730]":"730","sector[420]":"420","sector[185]":"185","sector[370]":"370","sector[630]":"630","sector[130]":"130","sector[620]":"620","sector[720]":"720","sector[710]":"710","sector[810]":"810","sector[430]":"430","sector[120]":"120","sector[450]":"450","sector[160]":"160","sector[530]":"530","sector[440]":"440",
        "country[POLSKA]":"on","country[AUSTRALIA]":"on","country[AUSTRIA]":"on","country[Belgia]":"on","country[BUŁGARIA]":"on","country[CYPR]":"on","country[CZECHY]":"on","country[DANIA]":"on","country[ESTONIA]":"on","country[FRANCJA]":"on","country[GLOBAL]":"on","country[GUERNSEY]":"on","country[HISZPANIA]":"on","country[HOLANDIA]":"on","country[INNY]":"on","country[IRLANDIA]":"on","country[KANADA]":"on","country[LITWA]":"on","country[LUKSEMBURG]":"on","country[NIEMCY]":"on","country[Norwegia]":"on","country[REPUBLIKA+CZESKA]":"on","country[SŁOWACJA]":"on","country[Słowenia]":"on","country[STANY+ZJEDNOCZONE]":"on","country[SZWAJCARIA]":"on","country[SZWECJA]":"on","country[UKRAINA]":"on","country[WĘGRY]":"on","country[WIELKA+BRYTANIA]":"on","country[WŁOCHY]":"on","country[JERSEY]":"on",
        "voivodship[11]":"on","voivodship[16]":"on","voivodship[5]":"on","voivodship[13]":"on","voivodship[17]":"on","voivodship[7]":"on","voivodship[2]":"on","voivodship[10]":"on","voivodship[8]":"on","voivodship[4]":"on","voivodship[15]":"on","voivodship[9]":"on","voivodship[6]":"on","voivodship[3]":"on","voivodship[12]":"on","voivodship[14]":"on"
    }    

    response = requests.post(url, data=data)
    soup = BeautifulSoup(response.content)

    all_links = soup.find_all("a")

    data = []

    for link in all_links:
        href = link.get("href")
        if "spolka?isin=" in href:
            data.append("https://www.gpw.pl/spolki/" + href)

    return data

# --- main ---

s = requests.Session()
s.headers.update({'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/80.0.3987.163 Chrome/80.0.3987.163 Safari/537.36'})

#data = get_page(s)
#print('\n'.join(data))

limit = 10
for offset in range(0, 10*limit, limit):
    print('---', offset, '---')
    data = get_ajax_data(s, offset, limit)
    print('\n'.join(data))
If you like it
Buy a Coffee