BeautifulSoup使用

beautifulsoup库

pip install beautifulsoup4 lxml
（下面的示例使用 lxml 解析器，因此需要同时安装 lxml）

基本使用

# Sample HTML document parsed by the examples that follow. It contains a
# <title>, three <p> nodes and three <a class="sister"> links. Note that the
# markup has no closing </body> or </html> tags.
html = """
<html><head><title>The Dormouse's title</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
from bs4 import BeautifulSoup

# Parse the sample document with the lxml parser (the lxml package must be
# installed for this parser name to be available).
soup = BeautifulSoup(html, 'lxml')

# --- Selecting nodes by tag name ---
# Prints: <title>The Dormouse's title</title>
print(soup.title)
# Prints: The Dormouse's title
print(soup.title.string)
# Prints: <head><title>The Dormouse's title</title></head>
print(soup.head)
# Prints: The Dormouse's story
# When several <p> nodes match, attribute-style access yields only the first.
print(soup.p.string)
# Prints: dromouse -- the value of the "name" attribute on the first <p> node.
# Both spellings below read the same attribute.
print(soup.p.attrs['name'])
print(soup.p['name'])

# --- Walking down the tree ---
# Direct children of the <html> node only.
for child in soup.html.children:
    print(child)
# Every descendant of the <html> node (children, grandchildren, ...).
for descendant in soup.html.descendants:
    print(descendant)

# --- Walking up the tree ---
# Direct parent of the first <a> node. `.parent` is a single node, so it is
# printed directly; looping over it would iterate the parent's children
# rather than show the parent itself.
print(soup.a.parent)
# All ancestors of the first <a> node, from the nearest one outwards.
for ancestor in soup.a.parents:
    print(ancestor)

# --- Siblings ---
# In this document the nodes adjacent to the first <a> are text nodes
# (the surrounding sentence fragments), not the other <a> tags.
print('下一个兄弟节点', soup.a.next_sibling)
print('上一个兄弟节点', soup.a.previous_sibling)

方法选择器

# Build a fresh parse tree of the sample document for the method-selector examples.
soup = BeautifulSoup(markup=html, features='lxml')
1. find_all
import re  # required by the regular-expression query at the end of this snippet

# NOTE(review): none of the queries below appear to match anything in the
# sample `html` defined earlier in these notes (it has no <ul>, no id
# "list-1", no class "element" and no text containing "link"), so each call
# prints an empty result against it. Presumably they were written for a
# different sample document -- confirm.

# Find every <ul> node.
print(soup.find_all(name='ul'))
# Filter by attribute: the `attrs` dict and the keyword form are equivalent.
print(soup.find_all(attrs={'id': 'list-1'}))
print(soup.find_all(id='list-1'))
# `class` is a Python keyword, so the filter argument is spelled `class_`.
print(soup.find_all(class_='element'))
# Match text nodes against a regular expression. `string=` is the current
# argument name; `text=` is its deprecated older spelling.
print(soup.find_all(string=re.compile('link')))
2. find
# Unlike find_all, find yields a single matching element.
print(soup.find('ul'))