Python usage: writing CSV files, adding proxy IPs for crawlers, and working with dictionaries

Writing a CSV file

import csv
from urllib.request import urlopen
from bs4 import BeautifulSoup

html = urlopen("https://en.wikipedia.org/wiki/Comparison_of_text_editors")
bsObj = BeautifulSoup(html, "lxml")
# The main comparison table is the first table on the page
table = bsObj.findAll("table", {"class": "wikitable"})[0]
rows = table.findAll("tr")

csvFile = open("editors.csv", "wt", newline='', encoding='utf-8')
writer = csv.writer(csvFile)
try:
    for row in rows:
        csvRow = []
        for cell in row.findAll(['td', 'th']):
            csvRow.append(cell.get_text())
        # Write one CSV row per table row, after all of its cells are collected
        writer.writerow(csvRow)
finally:
    csvFile.close()
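
As a quick sanity check, the file can be read back with csv.reader; a minimal sketch, assuming editors.csv was just written by the code above:

import csv

with open("editors.csv", newline='', encoding='utf-8') as f:
    reader = csv.reader(f)
    for i, row in enumerate(reader):
        print(row)               # each row comes back as a list of cell strings
        if i >= 2:               # show the header plus two data rows
            break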

Fetching a page image (1)

import urllib.request
# Download a single image and write the raw bytes to disk
response = urllib.request.urlopen('https://imgsrc.baidu.com/forum/w%3D580/sign=fdcdb5b2314e251fe2f7e4f09784c9c2/16391f30e924b89915f86eb06f061d950b7bf677.jpg')
cat_img = response.read()
with open('picture.jpg', 'wb') as f:
    f.write(cat_img)
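
Some image hosts reject requests carrying urllib's default User-Agent. A hedged variant that reuses the same URL but sends a browser-like header (the header string here is just an illustrative choice):

import urllib.request

img_url = 'https://imgsrc.baidu.com/forum/w%3D580/sign=fdcdb5b2314e251fe2f7e4f09784c9c2/16391f30e924b89915f86eb06f061d950b7bf677.jpg'
req = urllib.request.Request(img_url, headers={'User-Agent': 'Mozilla/5.0'})
with urllib.request.urlopen(req) as response:
    data = response.read()
with open('picture.jpg', 'wb') as f:
    f.write(data)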

Fetching page images (2)

import urllib.request
import re

def getHtml(url):
    page = urllib.request.urlopen(url)
    # Decode the raw bytes so the regex below can match against str
    html = page.read().decode('utf-8')
    return html

def getImg(html):
    # Match the src of Tieba post images (old markup carries a pic_ext attribute)
    reg = r'src="(.+?\.jpg)" pic_ext'
    imgre = re.compile(reg)
    imglist = re.findall(imgre, html)
    x = 0
    for imgurl in imglist:
        # urlretrieve lives in urllib.request in Python 3
        urllib.request.urlretrieve(imgurl, '%s.jpg' % x)
        x += 1
    return imglist

html = getHtml("https://tieba.baidu.com/p/2460150866")
print(getImg(html))
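
The regex above is tied to Tieba's old markup (the pic_ext attribute next to src). A more markup-tolerant sketch collects img src attributes with BeautifulSoup instead, assuming bs4 is installed; it could replace getImg's matching step:

from bs4 import BeautifulSoup

def get_img_urls(html):
    # Gather the src of every <img> tag that ends in .jpg
    soup = BeautifulSoup(html, "html.parser")
    return [img.get('src') for img in soup.find_all('img')
            if img.get('src', '').endswith('.jpg')]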

Adding a proxy IP to a crawler

import urllib.request
import random

url = 'https://whatismyip.com.tw'
iplist = ['121.201.97.136:80', '117.135.164.170:80', '58.247.31.230:80']
ip = random.choice(iplist)
# Register the proxy for both schemes; the target URL is https,
# so an 'http'-only mapping would never be used
proxy_support = urllib.request.ProxyHandler({'http': ip, 'https': ip})
opener = urllib.request.build_opener(proxy_support)
urllib.request.install_opener(opener)
response = urllib.request.urlopen(url)
html = response.read().decode('utf-8')
print(html)
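
Free proxies go stale quickly, so a single random pick often fails. A minimal sketch that reuses url and iplist from above but tries each proxy in turn, falling through on failure:

import urllib.request

for ip in iplist:
    handler = urllib.request.ProxyHandler({'http': ip, 'https': ip})
    opener = urllib.request.build_opener(handler)
    try:
        response = opener.open(url, timeout=10)   # use the opener directly, no install needed
        print(response.read().decode('utf-8'))
        break                                     # stop at the first proxy that answers
    except Exception as exc:
        print('proxy %s failed: %s' % (ip, exc))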

Fetching links embedded in a page

from bs4 import BeautifulSoup
from urllib.request import urlopen

rawtext = urlopen("https://bbs.gfan.com/android-8397839-1-1.html").read()
soup = BeautifulSoup(rawtext, "html.parser")
# The pager element <p class="pg"> holds links to the thread's other pages
targetDiv = soup.find('p', {'class': 'pg'})
catalogLinks = targetDiv.find_all('a')
indexlist = []
for l in catalogLinks[1:]:   # collect the href of each pager link after the first
    indexlist.append(l.get('href'))

for index in indexlist:
    print(index)
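
Pager hrefs on forum pages are often relative. A short hedged follow-up resolves them against the thread URL with urllib.parse.urljoin, reusing indexlist from above:

from urllib.parse import urljoin

base = "https://bbs.gfan.com/android-8397839-1-1.html"
for href in indexlist:
    print(urljoin(base, href))    # relative hrefs become absolute URLs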

Working with dictionaries

test = {
  "post": {
    "content": ""
  },
  "replys": [
    {
      "content": ""
    }
  ]
}

test["post"]["content"] = "xx"
test["replys"][0]["content"] = "yy"
test["replys"][0]["value"] = "zz"

test["replys"].append({"content":"","title":"","publish_date":""})
def store(measurements):
    import json
    with open('measurements.json', 'w') as f:
        f.write(json.dumps(measurements))   # serialize the argument, not the global

if __name__ == "__main__":
    store(test)
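
For completeness, the stored JSON can be loaded back into a dict with json.load; a minimal sketch, assuming measurements.json was written by store() above:

import json

def load(path='measurements.json'):
    with open(path) as f:
        return json.load(f)       # returns the same nested dict structure

data = load()
print(data["post"]["content"])    # -> xx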