CSS反爬案例

CSS位置偏移反爬案例

1
2
3
4
5
6
7
8
9
10
11
12
13
<!--可以看到,此处的书名被拆分成了多个span节点(每个节点一个字符),并且节点的先后顺序被打乱-->
<h3 data-v-7f1a77ef="" class="m-b-sm name">
<span data-v-7f1a77ef="" class="char" style="left: 16px;"></span>
<span data-v-7f1a77ef="" class="char" style="left: 48px;"></span>
<span data-v-7f1a77ef="" class="char" style="left: 0px;"></span>
<span data-v-7f1a77ef="" class="char" style="left: 32px;"></span>
</h3>
<!--可以看到position属性为absolute,代表绝对定位-->
<!--每个字符按各自的left值定位,因此页面上的显示顺序由left值从小到大决定,而不是由节点的先后顺序决定-->
.name .char[data-v-7f1a77ef] {
display: inline-block;
position: absolute;
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver import ChromeOptions
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import logging
import re

# Page scraped by this script.
URL_PAGE = 'https://antispider3.scrape.center/'

# Log at INFO and above; every record carries a timestamp and its level.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s: %(message)s',
    level=logging.INFO,
)

# Chrome options: exclude the "enable-automation" switch and disable the
# automation extension before the browser is started.
option = ChromeOptions()
option.add_experimental_option('excludeSwitches', ['enable-automation'])
option.add_experimental_option('useAutomationExtension', False)

browser = webdriver.Chrome(
    options=option,
    executable_path='/Users/keeep/Downloads/tool/chromedriver',
)

# Register a script that is evaluated on every new document and makes
# navigator.webdriver read as undefined.
browser.execute_cdp_cmd(
    'Page.addScriptToEvaluateOnNewDocument',
    {
        'source': 'Object.defineProperty(navigator, "webdriver", {get: () => undefined})',
    },
)

# Shared explicit wait with a 10 second timeout.
wait = WebDriverWait(browser, 10)


def get_page():
    """Load URL_PAGE and block until a title character span is visible.

    A TimeoutException raised while loading or waiting is logged together
    with its traceback and is not re-raised.
    """
    title_span = (By.XPATH, '//h3[contains(@class,"name")]/span')
    try:
        browser.get(URL_PAGE)
        # Wait for visibility rather than mere presence so the spans carry
        # usable content: a visible element is displayed and has a height
        # and width greater than 0.
        wait.until(EC.visibility_of_element_located(locator=title_span))
    except TimeoutException:
        logging.exception('error occurred while scraping %s', URL_PAGE)


def get_title():
    """Collect the book titles on the loaded page, undoing the CSS shuffle.

    Each title is an ``<h3>`` whose class contains ``name``. When the heading
    has ``<span>`` children, every non-empty span contributes its text and is
    ordered by the first integer found in its inline ``style`` attribute
    (the ``left`` offset in the markup this script targets). Headings
    without spans contribute their own text unchanged.

    Returns:
        list[str]: one title per heading, in document order. An empty list
        when no heading is found.
    """
    titles = []
    for heading in browser.find_elements(By.XPATH, '//h3[contains(@class,"name")]'):
        spans = heading.find_elements(By.XPATH, './span')
        if not spans:
            # Title is not split into spans; its text needs no reordering.
            titles.append(heading.text)
            continue

        # (offset, text) pairs. A list rather than a dict keyed by offset so
        # two spans sharing an offset do not overwrite each other.
        positioned = []
        for span in spans:
            text = span.text
            if not text:
                continue
            style = span.get_attribute('style') or ''
            offset = re.search(r'\d+', style)
            if offset is None:
                # Without an offset the character cannot be placed; report it
                # instead of failing on None.group().
                logging.warning('span %r has no numeric offset in style %r', text, style)
                continue
            positioned.append((int(offset.group()), text))

        # sorted() is stable, so equal offsets keep their document order.
        ordered = sorted(positioned, key=lambda pair: pair[0])
        titles.append(''.join(text for _, text in ordered))
    return titles


if __name__ == '__main__':
    try:
        get_page()
        get_title()
    finally:
        # quit() ends the whole WebDriver session (all windows plus the
        # driver process); close() would only close the current window and
        # leave the chromedriver process running after the script exits.
        browser.quit()

字体内容放置在CSS中案例

1
2
3
4
5
<p data-v-090744c8="" class="score m-t-md m-b-n-sm">
<span data-v-090744c8=""><i data-v-090744c8="" class="icon icon-789">::before</i></span>
<span data-v-090744c8=""><i data-v-090744c8="" class="icon icon-981">::before</i></span>
<span data-v-090744c8=""><i data-v-090744c8="" class="icon icon-504">::before</i></span>
</p>
1
2
3
4
5
6
7
8
9
10
11
.icon-504:before {
content: "8"
}

.icon-789:before {
content: "9"
}

.icon-981:before {
content: "."
}

所以最后渲染出的结果为 9.8。由上面可以发现,前端将数据隐藏在了 CSS 文件中,因此需要找到该 CSS 文件并对其发起请求,从中解析出 class 名与实际字符的对应关系。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import requests
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver import ChromeOptions
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import logging
import re

# Log at INFO and above; every record carries a timestamp and its level.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s: %(message)s',
    level=logging.INFO,
)

# Chrome options: exclude the "enable-automation" switch and disable the
# automation extension before the browser is started.
option = ChromeOptions()
option.add_experimental_option('excludeSwitches', ['enable-automation'])
option.add_experimental_option('useAutomationExtension', False)

browser = webdriver.Chrome(
    options=option,
    executable_path='/Users/keeep/Downloads/tool/chromedriver',
)

# Register a script that is evaluated on every new document and makes
# navigator.webdriver read as undefined.
browser.execute_cdp_cmd(
    'Page.addScriptToEvaluateOnNewDocument',
    {
        'source': 'Object.defineProperty(navigator, "webdriver", {get: () => undefined})',
    },
)

# Shared explicit wait with a 10 second timeout.
wait = WebDriverWait(browser, 10)
# Matches stylesheet rules of the form  .icon-NNN:before{content:"C"}
# and captures NNN and C. Written as a raw string so the escaped braces are
# valid regex escapes rather than invalid string escapes, and with the
# leading dot escaped so it only matches a literal '.'.
RE_RULE = r'\.icon-(.*?):before\{content:"(.*?)"\}'
URL_PAGE = 'https://antispider4.scrape.center/'
# A timeout keeps the script from hanging indefinitely if the server stalls.
response = requests.get('https://antispider4.scrape.center/css/app.654ba59e.css', timeout=10)
pattern = re.compile(RE_RULE)
findall = pattern.findall(response.text)
# Mapping from icon class suffix (NNN) to the character its :before rule renders.
info = dict(findall)


def get_page():
    """Load URL_PAGE and block until a score paragraph is visible.

    A TimeoutException raised while loading or waiting is logged together
    with its traceback and is not re-raised.
    """
    score_paragraph = (By.XPATH, '//p[contains(@class,"score")]')
    try:
        browser.get(URL_PAGE)
        wait.until(EC.visibility_of_element_located(locator=score_paragraph))
    except TimeoutException:
        logging.exception('error occurred while scraping %s', URL_PAGE)


def get_info():
    """Print the title and decoded score of every item on the loaded page.

    Items are the ``<div>`` elements whose class contains ``item``. For each
    one the title is read from the ``<h2>`` inside its ``<a class="name">``
    link and the score is decoded by ``get_score``.

    Raises:
        Whatever ``find_element`` raises when an item has no matching
        ``<h2>`` (the lookup is not guarded).
    """
    for item in browser.find_elements(By.XPATH, '//div[contains(@class,"item")]'):
        # Search relative to the item and use the heading's own text; taking
        # item.text instead would return the text of the whole item block.
        title = item.find_element(By.XPATH, './/a[@class="name"]/h2').text
        score = get_score(item)
        print(title, score)


def get_score(element):
    """Decode an item's score from the CSS classes of its icon elements.

    The score characters are not in the DOM text: each one is an
    ``<i>`` element under ``<p class="... score ...">/<span>`` whose class
    contains ``icon-NNN``. The module-level ``info`` dict maps ``NNN`` to the
    character taken from the stylesheet.

    Args:
        element: the item element to search within (must provide
            ``find_elements``).

    Returns:
        str: the mapped characters joined in document order; an empty string
        when the item has no icon elements. Icons whose class has no
        ``icon-NNN`` part, or whose ``NNN`` is missing from ``info``, are
        skipped with a warning.
    """
    icons = element.find_elements(By.XPATH, './/p[contains(@class,"score")]/span/i')
    chars = []
    for icon in icons:
        class_attr = icon.get_attribute('class') or ''
        match = re.search(r'icon-(\d+)', class_attr)
        if match is None:
            logging.warning('no icon-NNN class found in %r', class_attr)
            continue
        char = info.get(match.group(1))
        if char is None:
            # Joining None would raise TypeError; report the gap instead.
            logging.warning('no CSS content known for icon class %s', match.group(1))
            continue
        chars.append(char)
    # Local result only: the former `global join` leaked the last score into
    # module state and could return a stale value when no icons were found.
    return ''.join(chars)
赏个🍗吧
0%