python spider

最新推荐文章于 2025-04-09 19:45:34 发布

mrhao61

最新推荐文章于 2025-04-09 19:45:34 发布

阅读量861

点赞数

CC 4.0 BY-SA版权

分类专栏： python

本文链接：https://blog.csdn.net/weixin_43870649/article/details/101535743

python 专栏收录该内容

13 篇文章

订阅专栏

import re
# Warm-up example, kept disabled: look for the literal text "hao" in "libohao".
#   string = "libohao"
#   pat = "hao"
#   rst = re.search(pat, string)
#   print(rst)

# Non-printing characters such as "\n" (newline) and "\t" (tab) are ordinary
# atoms: here the pattern is a single newline character.
string = "raosjdaodj\nbaidu\n"
pat = "\n"
rst = re.compile(pat).search(string)
# print(rst)

# Generic character classes that can be used as atoms:
#   \w  a letter, digit or underscore
#   \W  anything except a letter, digit or underscore
#   \d  a decimal digit
#   \s  a whitespace character
string1 = "asdasda123132"
# Raw string: "\w" and "\d" are not valid Python string escapes, so a plain
# literal triggers an invalid-escape-sequence warning on modern interpreters.
pat = r"\w\w\d\d"
rst = re.search(pat, string1)  # first hit is "da12"
# print(rst)

# Atom table (character set): any ONE of the listed characters.
pat = "as[abcd]"
rst = re.search(pat, string1)  # matches "asd"
# print(rst)

# Negated atom table: any one character EXCEPT those listed.
pat = "as[^d]"
rst = re.search(pat, string1)  # None: every "as" in string1 is followed by "d"
# print(rst)

# Metacharacters:
#   .      any single character except a newline
#   ^      start of the string
#   $      end of the string
#   *      zero, one or many repetitions
#   ?      zero or one repetition
#   +      one or many repetitions
#   {n}    exactly n repetitions
#   {n,}   at least n repetitions
#   {n,m}  at least n and at most m repetitions
#   |      alternation ("or")
#   ()     grouping into a pattern unit
string1 = "asdadasda123132"

# Alternative patterns to experiment with. Each assignment replaces the
# previous one, so only the last pattern is actually searched below.
pat = "asd+"
pat = "^asd......."
pat = "asd.*"
pat = "da{2}"  # {2} applies to "a" only, i.e. this looks for "daa"
rst = re.search(pat, string1)  # None: string1 contains no "daa"
# print(rst)


# Pattern modifiers (flags):
#   I  ignore case while matching
#   M  multi-line matching
#   L  locale-aware matching
#   U  Unicode matching
#   S  let "." match newlines as well
string = "python"
pat = "Pyt"
# Case-insensitive search, so "Pyt" still finds the lowercase "pyt".
rst = re.compile(pat, flags=re.IGNORECASE).search(string)
# print(rst)

# Greedy mode vs. lazy mode.
string = "poythonyhjskjsa"
# Greedy ".*" runs to the LAST "y"; lazy ".*?" stops at the FIRST "y".
pat1, pat2 = "p.*y", "p.*?y"
rst1, rst2 = (
    re.compile(pattern, re.IGNORECASE).search(string)
    for pattern in (pat1, pat2)
)
# print(rst1)
# print(rst2)

# Regular-expression functions covered in these notes:
#   re.match()
#   re.search()
#   re.sub()

string = "poythonyhjskjsa"
pat = "p.*y"
# match() only succeeds at the very beginning of the string, so switching to
# a pattern that starts with "o" yields no match.
pat = "o.*y"
rst = re.compile(pat, re.IGNORECASE).match(string)
# print(rst)

# Global matching: collect every non-overlapping match in the text
# (equivalent to re.compile(pattern).findall(text) for a group-free pattern).

string = "poythonyhjskjsapoythonpoythonpoythonpoython"
pat = "p.*?y"
rst = [hit.group() for hit in re.finditer(pat, string)]
# print(rst)

# Match http/https URLs whose host ends in ".com" or ".cn".
string = "http://www.baidu.com"
# Notes on the pattern:
#   * "(?:com|cn)" is a real alternation; "[com|cn]" would be a character
#     class matching any single one of the characters c, o, m, |, n.
#   * the group is non-capturing so findall() returns the whole URL.
#   * "\b" keeps ".com" from matching inside a longer label such as ".comcn".
pat = r"https?://[a-zA-Z0-9.-]+\.(?:com|cn)\b"
rst = re.compile(pat).findall(string)
# print(rst)


# Match phone numbers written as 4 digits + "-" + 7 digits, or
# 3 digits + "-" + 8 digits.
string = "asdsd1234-1234567sad123-12345678"
# Raw string: "\d" is not a valid Python string escape, so a plain literal
# triggers an invalid-escape-sequence warning on modern interpreters.
pat = r"\d{4}-\d{7}|\d{3}-\d{8}"
rst = re.compile(pat).findall(string)
# print(rst)

def demo_url_basics(url="https://read.douban.com/provider/all/"):
    """Show the basic urllib.request helpers on a single page.

    This demo performs network I/O, so it is defined but never called at
    import time; call it explicitly to run it.

    Args:
        url: Page to open.

    Prints the HTTP status code and the final URL of the response.
    """
    import urllib.request

    # urlretrieve(url, local_path) downloads a page straight to disk, e.g.:
    # urllib.request.urlretrieve(
    #     "https://blog.csdn.net/weixin_43870649/article/details/101419805",
    #     "D:\\Desktop\\python爬虫\\download1.html",
    # )

    # urlcleanup() clears temporary files left behind by urlretrieve().
    urllib.request.urlcleanup()

    # The context manager closes the connection even if a call below fails.
    with urllib.request.urlopen(url) as response:
        # info() returns the response headers.
        # print(response.info())
        # getcode() returns the HTTP status of the fetched page.
        print(response.getcode())
        # geturl() returns the URL that was actually retrieved.
        print(response.geturl())
def demo_timeout(url="http://www.baodu.com", attempts=100, timeout=0.2):
    """Fetch *url* repeatedly with a short timeout and report each outcome.

    This demo performs network I/O, so it is defined but never called at
    import time; call it explicitly to run it.

    Args:
        url: Page to fetch.
        attempts: Number of fetches to try.
        timeout: Per-request timeout in seconds.

    Prints the body length on success, or the error text on failure.
    """
    import urllib.request

    for _ in range(attempts):
        # urlopen() itself raises when the timeout expires, so it has to be
        # inside the try block; otherwise the first timeout ends the loop.
        try:
            with urllib.request.urlopen(url, timeout=timeout) as response:
                print(len(response.read()))
        except Exception as err:  # demo: report the failure and keep going
            print(str(err))
# The two regexes used to pull result titles out of a search page; compiled
# once here instead of once per downloaded page.
TITLE_PATTERNS = (
    re.compile("title:'(.*?)',"),
    re.compile('"title":"(.*?)"'),
)


def extract_titles(html):
    """Return all titles captured in *html*.

    Matches of the first pattern come first, followed by matches of the
    second pattern. Returns an empty list when nothing matches.
    """
    titles = []
    for pattern in TITLE_PATTERNS:
        titles.extend(pattern.findall(html))
    return titles


def demo_get_search(keyword="李博浩", pages=999):
    """Simulate GET requests: search Baidu for *keyword* and print the titles.

    This demo performs network I/O, so it is defined but never called at
    import time; call it explicitly to run it.

    Args:
        keyword: Search term; percent-encoded before it is put in the URL.
        pages: Number of result pages to fetch (the "pn" offset grows by 10
            per page).
    """
    import urllib.parse
    import urllib.request

    quoted = urllib.parse.quote(keyword)
    for page in range(pages):
        url = "http://www.baidu.com/s?wd=" + quoted + "&pn=" + str(page * 10)
        with urllib.request.urlopen(url) as response:
            html = response.read().decode("utf-8")
        for title in extract_titles(html):
            print(title)
def login_succeeded(body):
    """Return True when the response *body* contains the success message."""
    return "论坛登录成功" in body


def demo_post_login(url="http://39.105.30.139:8081/f7/loginadmin.action",
                    attempts=50):
    """Simulate POST requests: submit the login form and report success.

    This demo performs network I/O, so it is defined but never called at
    import time; call it explicitly to run it.

    Args:
        url: Login endpoint that receives the form.
        attempts: Number of times the form is submitted.

    Prints the success message for each successful login, or the error text
    when a request fails.
    """
    import urllib.parse
    import urllib.request

    # Form fields are URL-encoded and sent as bytes in the request body.
    postdata = urllib.parse.urlencode({
        "a_id": "17",
        "a_pwd": "11111111",
    }).encode("utf-8")

    for _ in range(attempts):
        try:
            req = urllib.request.Request(url, postdata)
            with urllib.request.urlopen(req) as response:
                body = response.read().decode("utf-8")
            # Comparing the search result with the string "NONE" is always
            # true; check for the success text explicitly instead.
            if login_succeeded(body):
                print("论坛登录成功")
        except Exception as err:  # demo: report the failure and keep going
            print(str(err))