# -*- coding: UTF-8 -*-
# Read URLs from url.txt (one per line), fetch each page and print the text
# selected by two XPath expressions.
from lxml import etree
from bs4 import BeautifulSoup
import requests


def readalight(address, timeout=30):
    """Fetch one page and print its summary and main-content paragraphs.

    The summary text nodes (``//p[@class="summary"]``) are printed as-is,
    one per line. The paragraph text nodes under ``//div[@id="mainCnt"]/p``
    are printed wrapped in ``<p>...</p>``.

    Args:
        address: URL of the page to fetch.
        timeout: Seconds to wait for the server before ``requests`` gives
            up. Without a timeout a stalled server would block the whole
            run indefinitely.

    Raises:
        requests.RequestException: If the request fails or times out.
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    # Fetch the page source and decode it as UTF-8.
    response = requests.get(address, timeout=timeout)
    html = response.content.decode('utf-8')

    # XPath matching.
    dom_tree = etree.HTML(html)
    paragraphs = dom_tree.xpath('//div[@id="mainCnt"]/p/text()')
    summary = dom_tree.xpath('//p[@class="summary"]/text()')

    for text in summary:
        print(text)
    # NOTE(review): the extracted text is not HTML-escaped before being
    # wrapped in <p> tags; escape it if the output is meant to be valid HTML.
    for text in paragraphs:
        print(f"<p>{text}</p>")


# The context manager closes the file even if a fetch raises part-way through.
with open('url.txt', 'r', encoding='UTF-8') as f:
    for line in f:
        # Drop the trailing newline (and stray whitespace) so it is not sent
        # as part of the URL.
        url = line.strip()
        if not url:
            # Blank lines are not URLs; skip them instead of failing.
            continue
        print(url)
        readalight(url)
以上为使用 Python 逐条读取网址,并通过 XPath 采集数据的方案。