作者:谦谦骄子_458 | 来源:互联网 | 2023-08-10 11:20
我正在尝试抓取这个网站:https://case.occ.ok.gov/ords/f?p=1004:203
拼图中缺失的一块,是弄清楚如何在发出最终请求之前“获取”数据负载中的 p_request 参数。在查看“主”页面时,此字段为空,因此我无法取得它的值并传递给我的 POST 请求。
下面的代码不起作用,因为有效负载中的 p_request 参数是空的;不过我通过开发者控制台的测试知道,只要能够获得 p_request 字段的值,它就能正常工作。
# Query the main page, rebuild the AJAX payload from its hidden form fields,
# and POST it to the APEX endpoint.
#
# NOTE: `letter` (the value submitted as P203_LASTNAME) is not defined in this
# snippet; the surrounding script must set it before this code runs.
import json

import requests
from bs4 import BeautifulSoup

url = 'https://case.occ.ok.gov/ords/f?p=1004:203'
r = requests.get(url)
soup = BeautifulSoup(r.text, 'lxml')

# Copy the two cookies set by the first response so the POST can send them
# back.  The attribute is `Response.cookies` (lowercase); `r.COOKIEs` raises
# AttributeError.
cookies = {}
cookdat = r.cookies
cookies['ORA_WWV_APP_1004'] = cookdat.get('ORA_WWV_APP_1004')
cookies['X-Oracle-BMC-LBS-Route'] = cookdat.get('X-Oracle-BMC-LBS-Route')

# Map every <input> id to its value.  Inputs without an id are skipped;
# indexing them with ['id'] would raise KeyError.
inputs = soup.select('input')
d_inputs = {i['id']: i.get('value', '') for i in inputs if i.get('id')}

# Serialise p_json with json.dumps so quotes or backslashes in any value are
# escaped correctly.  Compact separators keep the output identical to the
# hand-written template for ordinary values.
p_json = json.dumps(
    {
        "pageItems": {
            "itemsToSubmit": [
                {"n": "P203_LASTNAME", "v": str(letter)},
                {"n": "P203_FIRSTNAME", "v": ""},
                {"n": "P203_SEARCH_CRITERIA", "v": "1"},
            ],
            "protected": d_inputs['pPageItemsProtected'],
            "rowVersion": "",
            "formRegionChecksums": [],
        },
        "salt": d_inputs['pSalt'],
    },
    separators=(',', ':'),
    ensure_ascii=False,
)

# A list of pairs (not a dict) because `f01` and `f02` are each sent three
# times.
data = [
    ('p_flow_id', '1004'),
    ('p_flow_step_id', '203'),
    ('p_instance', d_inputs['pInstance']),
    ('p_debug', ''),
    # Left empty as in the original post: this is the field the question is
    # about, and the request does not work until it carries a real value.
    ('p_request', ''),
    ('p_widget_name', 'worksheet'),
    ('p_widget_mod', 'PULL'),
    ('p_widget_action', ''),
    ('p_widget_num_return', '100000'),
    ('x01', '8980043036046866'),
    ('x02', '8985720770049096'),
    ('f01', 'R8980010866046866_column_search_current_column'),
    ('f01', 'R8980010866046866_search_field'),
    ('f01', 'R8980010866046866_row_select'),
    ('f02', ''),
    ('f02', ''),
    ('f02', '50'),
    ('p_json', p_json),
]

# POST the payload and print the raw response.  The keyword is `cookies=`
# (lowercase); `COOKIEs=` is rejected with TypeError.
r = requests.post(
    'https://case.occ.ok.gov/ords/wwv_flow.ajax', cookies=cookies, data=data
)
print(r.text)
在开发者控制台中,我看到在提交我想要的那类请求时,这个字段是有值的,尽管它在主页面上是空白的:
我该如何“获取”这个字段的值?它是请求能够正常工作所必需的。
回答
这对我有用
import requests
import json
from bs4 import BeautifulSoup
# Search the APEX page for last names matching `letter`, then collect the
# rows of the first result page and of the following page into `users`.

# globals
users = []  # one {'name': ..., 'surname': ...} dict per parsed table row
letter = "A"  # value submitted as the P203_LASTNAME page item
REQUEST_TIMEOUT = 30  # seconds; without a timeout requests can wait forever


def _collect_inputs(soup):
    """Map each <input> element's id to its value ('' when it has none).

    Inputs without an id attribute are skipped; indexing them with ['id']
    would raise KeyError.
    """
    return {
        tag['id']: tag.get('value', '')
        for tag in soup.select('input')
        if tag.get('id')
    }


def _rows_to_users(soup):
    """Return name/surname dicts for the rows of the 'a-IRR-table' table.

    Returns an empty list when *soup* contains no such table.  Rows with
    fewer than two <td> cells (e.g. header rows) are skipped.
    """
    table = soup.find('table', {'class': 'a-IRR-table'})
    if table is None:
        return []
    cell_rows = (row.find_all('td') for row in table.find_all('tr'))
    return [
        {'name': cells[0].text, 'surname': cells[1].text}
        for cells in cell_rows
        if len(cells) >= 2
    ]


# session (keeps the cookies from the first GET for all later requests)
session = requests.Session()

# get page
auth = session.get(
    'https://case.occ.ok.gov/ords/f?p=1004:203', timeout=REQUEST_TIMEOUT
)
auth.raise_for_status()
soup = BeautifulSoup(auth.text, 'html.parser')
d_inputs = _collect_inputs(soup)

# create params
params = {
    'p_flow_id': d_inputs['pFlowId'],
    'p_flow_step_id': d_inputs['pFlowStepId'],
    'p_instance': d_inputs['pInstance'],
    'p_debug': '',
    'p_request': 'Search',
    'p_reload_on_submit': d_inputs['pReloadOnSubmit'],
    'p_page_submission_id': d_inputs['pPageSubmissionId'],
    'p_json': json.dumps({
        "pageItems": {
            "itemsToSubmit": [
                {"n": "P203_LASTNAME", "v": letter},
                {"n": "P203_FIRSTNAME", "v": ""},
                {"n": "P203_SEARCH_CRITERIA", "v": "1"},
            ],
            "protected": d_inputs['pPageItemsProtected'],
            "rowVersion": "",
            "formRegionChecksums": [],
        },
        "salt": d_inputs['pSalt'],
    }),
}

# Send request to APEX (submits the search form)
session.post(
    'https://case.occ.ok.gov/ords/wwv_flow.accept',
    data=params,
    timeout=REQUEST_TIMEOUT,
).raise_for_status()

# get page with data (first)
data_page = session.get(
    'https://case.occ.ok.gov/ords/f?p=1004:203:{}::NO:::'.format(
        d_inputs['pInstance']
    ),
    timeout=REQUEST_TIMEOUT,
)
data_page.raise_for_status()
table_soup = BeautifulSoup(data_page.text, 'html.parser')
users.extend(_rows_to_users(table_soup))

# new params: re-read the inputs, the result page carries the worksheet id
d_inputs = _collect_inputs(table_soup)
# Cut the JSON argument out of the `interactiveReport(...)` call embedded in
# the page text; it holds the `ajaxIdentifier` used as p_request below.
json_ajax_data = json.loads(data_page.text.split(
    'interactiveReport('
)[1].split(');})();')[0])

# get data for next pages
params_news = {
    'p_flow_id': params['p_flow_id'],
    'p_flow_step_id': params['p_flow_step_id'],
    'p_instance': params['p_instance'],
    'p_debug': '',
    'p_request': 'PLUGIN={}'.format(json_ajax_data['ajaxIdentifier']),
    'p_widget_name': 'worksheet',
    'p_widget_mod': 'ACTION',
    'p_widget_action': 'PAGE',
    'p_widget_action_mod': 'pgR_min_row=51max_rows=50rows_fetched=50',
    'p_widget_num_return': 50,
    # NOTE(review): x01 and x02 both send the worksheet id here, while the
    # payload in the question used two different values -- confirm whether
    # x02 should be a separate (report) id.
    'x01': d_inputs['R8980010866046866_worksheet_id'],
    'x02': d_inputs['R8980010866046866_worksheet_id'],
    'p_json': params['p_json'],
}

# get next page data
next_page = session.post(
    'https://case.occ.ok.gov/ords/wwv_flow.ajax',
    data=params_news,
    timeout=REQUEST_TIMEOUT,
)
next_page.raise_for_status()
next_page_soup = BeautifulSoup(next_page.text, 'html.parser')

# parse rows of the next page.  The rows must come from `next_page_soup`;
# reading `table_soup` here would only re-parse the first page.
# Assumes the AJAX response is an HTML fragment using the same table class
# -- TODO confirm against a live response.
users.extend(_rows_to_users(next_page_soup))

print(users)
[
{'name': 'ANDERSON', 'surname': 'MICHAEL L AND KAREN'},
{'name': 'ALVAREZ', 'surname': 'PETRA'},
...
]