欢迎您访问程序员文章站!本站旨在为大家提供和分享程序员计算机编程知识。
您现在的位置是: 首页  >  IT编程

python—正则表达式实例

程序员文章站 2023-03-25 09:18:33
如果要学会Python爬虫,那么正则表达式是不可或缺的技能。在下收集了一些关于正则表达式的代码,多多练习,多多学习#match函数应用import reprint(re.match("done|quit",'d!one!done'))print(re.match("\dcom","www.4comrunoob.5com"))#search函数应用import reprint(re.search("done|quit",'d!one!done'))print(re.search("\dcom"...

如果要学会 Python 爬虫,那么正则表达式是不可或缺的技能。在下收集了一些关于正则表达式的代码,供大家多多练习、多多学习。

# Using re.match: it only succeeds when the pattern matches at the very
# start of the string, so both calls below print None.
import re
print(re.match("done|quit", 'd!one!done'))
# Raw string: "\d" in a normal string literal is an invalid escape sequence.
print(re.match(r"\dcom", "www.4comrunoob.5com"))
# Using re.search: it scans the whole string and returns the first match.
print(re.search("done|quit", 'd!one!done'))
print(re.search(r"\dcom", "www.4comrunoob.5com"))
# First word character that does not occur again later in the string
# (negative lookahead containing a backreference to group 1).
result = re.search(r"(\w)(?!.*\1)", "abc@cslg.edu.cn")
print(result)

# Using match objects with ordinary (numbered) groups, example 1
import re
m = re.compile(r"(\w+) (\w+)").match("Isaac Newton, physicist")
# group(0) is the whole match; group(1) and group(2) are the two words.
print(m.group(0), m.group(1), m.group(2), sep="\n")
# Asking for several groups at once returns them as a tuple.
print(m.group(1, 2))
# "\1" is a backreference: the second word must repeat the first one.
m = re.compile(r"(\w+) \1").match("Isaac Isaac, physicist")
print(m.group())

# Using match objects with ordinary (numbered) groups, example 2
import re
m = re.compile(r"(\d+)\.(\d+)").match("24.556")
# groups() returns every captured group as a tuple.
print(m.groups())
# Whole match followed by the integer part and the fractional part.
print(*m.group(0, 1, 2))



# Using match objects with named groups, example 1
import re
# The space between the two named groups is required: without it the greedy
# first group swallows "Isaa" and the second group is left with just "c".
m = re.match(r"(?P<first_name>\w+) (?P<last_name>\w+)", "Isaac Newton, physicist")
# groupdict() maps each group name to the text it captured.
print(m.groupdict())
print(m.group())
# Named groups can be fetched by name ...
print(m.group("first_name"))
print(m.group("last_name"))
# ... and are still available by position.
print(m.groups())
print(m.group(0), m.group(1), m.group(2))


# Using match objects with named groups, example 2
m = re.compile(r"(?P<first_name>\w+) (?P<last_name>\w+)").match("Malcolm Reynolds")
# Print the captured text keyed by group name.
print(m.groupdict())



# findall called through the re module
import re
s = "aabc abcd abbcd aacd"
# Every non-overlapping occurrence of "aa".
print(re.compile("aa").findall(s))

s = 'aabc abcd abbcd abccd abcdd'
# Words containing a repeated run of characters; the pattern has two
# capturing groups, so each result is a tuple (whole word, group "f").
print(re.compile(r"(\b\w*(?P<f>\w+)(?P=f)\w*\b)").findall(s))

# Word characters that do not occur again later in the string.
result = re.compile(r"(\w)(?!.*\1)").findall("abc@cslg.edu.cn")
print(result)



# findall called with a compiled regular expression object
import re
tt = "Tina is a good girl, she is cool, clever, and so on..."
pattern = re.compile(r"\w*oo\w*")
# Words containing "oo"; the module-level function also accepts a compiled pattern.
print(re.findall(pattern, tt))

s = 'aabc abcd abbcd abccd abcdd'
# Words containing a repeated run of characters, reported as (word, group "f").
print(re.compile(r"(\b\w*(?P<f>\w+)(?P=f)\w*\b)").findall(s))


# finditer called through the re module
import re
s = "aabc abcd abbcd aacd"
# finditer returns an iterator, so printing it shows the iterator object itself.
print(re.compile("aa").finditer(s))
iter_re = re.compile("aa").finditer(s)
for item in iter_re:
    # item[0] and item.group() are both the whole match.
    print(item[0], item.group(), item.groups(), item.groupdict())

s = 'aabc abcd abbcd abccd abcdd abab'
# Words that contain the same character twice in a row.
print(re.compile(r"(\b\w*(?P<f>\w)(?P=f)\w*\b)").findall(s))
iter_re = re.compile(r"(\b\w*(?P<f>\w)(?P=f)\w*\b)").finditer(s)
for item in iter_re:
    print(item.group(), item[0], item.groups(), item.groupdict())





# String processing with the re module: split
import re
text = "alpha. beta....gamma delta"
# Split on runs of dots. Raw strings keep "\." from being an invalid escape
# sequence in the string literal.
result = re.split(r'\.+', text)
print(result)
# maxsplit caps the number of splits; it is passed by keyword because
# passing it positionally is deprecated in recent Python versions.
result = re.split(r'\.+', text, maxsplit=2)
print(result)
result = re.split(r'\.+', text, maxsplit=1)
print(result)

# Split on either a run of dots or a single whitespace character.
result = re.split(r"\.+|\s", "hello world...d.t", maxsplit=2)
print(result)
result = re.split(r"\.+|\s", "hello world...d.t", maxsplit=3)
print(result)




import re
# Four dash-separated groups of one to three digits each.
result = re.match(r"\d{1,3}-\d{1,3}-\d{1,3}-\d{1,3}", "111-234-455-233")
print(result)
# re.match returns None on failure; compare with None by identity.
if result is not None:
    print(result.group(0))







# String replacement with sub
import re
# Replace only the first two words; count is passed by keyword because
# passing it positionally is deprecated in recent Python versions.
a = re.sub(r'\w+', '10', "ji 43 af,geq", count=2, flags=re.I)
# A compiled pattern without a count replaces every word.
exp = re.compile(r'\w+', re.I)
b = exp.sub('10', "ji 43 af,geq")
print(a)
print(b)


# String processing with the re module: sub
import re
pat = '{name}'
text = "Dear {name}..."
# Escape the placeholder so its braces are always taken literally rather than
# relying on the regex engine not treating "{name}" as a repetition count.
result = re.sub(re.escape(pat), 'Mr.Dong', text)
print(type(result), result)
s = "a s d"
# Alternation: each of the three letters is replaced.
result = re.sub('a|s|d', "good", s)
print(result)
# Raw string avoids invalid escape sequences; count is passed by keyword
# because passing it positionally is deprecated in recent Python versions.
result = re.sub(r"\.+|\s", "#", "hello world...d.t", count=3)
print(result)

# String processing with the re module: sub with backreferences and callables
s = "It's a very good good idea"
# Collapse an immediately repeated word into a single occurrence.
result = re.compile(r"(\b\w+) \1").sub(r"\1", s)
print(result)
# Same idea, but the trailing space is part of the repeated group and is
# not put back by the replacement.
result = re.compile(r"((\w+) )\1").sub(r"\2", s)
print(result)
print(re.compile('a').findall('aaa abc abcd'))
# The replacement may be a function that receives each match object.
result = re.compile('a').sub(lambda hit: hit[0].upper(), 'aaa abc abcd')
print(result)
result = re.compile('[a-z]').sub(lambda hit: hit[0].upper(), 'aaa abc abcd')
print(result)
# Flip the case of every ASCII letter.
result = re.compile('[a-zA-Z]').sub(lambda hit: hit[0].swapcase(), 'aaa abc Abcd')
print(result)

# subn returns a (new_string, number_of_substitutions) tuple.
result = re.compile('a').subn('dfg', 'aaa abc adde')
print(result)
result = re.compile('a').sub('dfg', 'aaa abc adde')
print(result)
# escape() backslash-escapes regex metacharacters in a literal string.
result = re.escape('http://www.python.org')
print(result)





# Example: removing redundant spaces
import re
s = "aaa    bb   c d e  fff "
# Plain string methods: split on any whitespace, then re-join with one space.
print(" ".join(s.split()))
# The same with regular expressions. Raw strings keep "\s" from being an
# invalid escape sequence in the string literal.
print(" ".join(re.split(r'\s+', s.strip())))
print(" ".join(re.split(r'[\s]+', s.strip())))
print(re.sub(r'\s+', " ", s.strip()))


# Removing a given piece of text from a string
import re
email = "tony@tiremove_thisger.net"
m = re.compile("remove_this").search(email)
# Three ways to drop the unwanted text: slice around the match,
# substitute it with the empty string, or use str.replace.
print("".join((email[:m.start()], email[m.end():])))
print(re.compile("remove_this").sub("", email))
print(email.replace("remove_this", ""))

# Greedy versus non-greedy matching
import re
# Raw strings are used for every pattern below so that "\b", "\w", "\d" and
# "\s" reach the regex engine unchanged instead of relying on doubled
# backslashes or invalid string escape sequences.
email = "Beautiful is better than ugly."
# Non-greedy: stop at the first word boundary after a word-initial "b".
lst = re.findall(r"\bb.+?\b", email)
print(lst)
# Greedy: run on to the last word boundary in the string.
lst = re.findall(r"\bb.+\b", email)
print(lst)
# Whole words starting with "b".
lst = re.findall(r"\bb\w*\b", email)
print(lst)
# An "h" that is not at a word boundary, up to the next boundary.
lst = re.findall(r"\Bh.+?\b", email)
print(lst)
lst = re.findall(r"\b\w.+?\b", email)
print(lst)



# Every run of word characters.
lst = re.findall(r"\w+", email)
print(lst)
lst = re.findall(r"\b\w.+?\b", email)
print(lst)
# Split on single whitespace characters.
result = re.split(r'\s', email)
print(result)
# Dotted three-part version numbers.
result = re.findall(r"\d+\.\d+\.\d+", "python 2.7.13")
print(result)
result = re.findall(r"\d+\.\d+\.\d+", "python 2.7.13, python 3.6.0")
print(result)



# Matching the contents of a web page
import re
s = "<html><head>This is head</head><body>This is a body</body></html>"
pattern = r'<html><head>(.+)</head><body>(.+)</body></html>'
result = re.compile(pattern).search(s)
# The two groups capture the head text and the body text.
print(result.groups())
print(*result.group(0, 1, 2))





# Extracting phone numbers with a regular expression
import re
text = "Suppose my Phone No. is 0535-1234567, yours is 010-12345678, his is 025-87654321."
# Each result is an (area code, local number) tuple.
result = re.compile(r"(\d{3,4})-(\d{7,8})").findall(text)
print(result)
for item in result:
    # Re-join the two captured parts with a dash.
    print("-".join(item))



# Finding the longest run of digits in a text with regular expressions
import re
def logest1(s):
    """Return the longest run of consecutive digits found in *s*.

    When several runs share the maximum length, the first one wins.
    Returns the string "no" when *s* contains no digits at all.
    """
    # Raw string: "\d" in a normal string literal is an invalid escape sequence.
    runs = re.findall(r"\d+", s)
    if not runs:
        return "no"
    return max(runs, key=len)

def logest2(s):
    """Return the longest run of consecutive digits found in *s*.

    Works by splitting *s* on runs of non-digit characters. When several
    digit runs share the maximum length, the first one wins. Returns the
    string "no" when *s* contains no digits at all.
    """
    # re.split always returns at least one element and yields empty strings
    # at the edges, so the empty pieces must be dropped before deciding
    # whether any digits were found.
    runs = [piece for piece in re.split(r"\D+", s) if piece]
    if not runs:
        return "no"
    return max(runs, key=len)

# Run both helpers on the same sample and print one result per line.
print(*(find("hehe3455cat343535355") for find in (logest1, logest2)), sep="\n")



import re
def reverse_new(s):
    """Return the whitespace-separated words of *s* in reverse order.

    Leading and trailing whitespace is stripped and any run of whitespace
    between words collapses to a single space in the result.
    """
    # Raw string: "\s" in a normal string literal is an invalid escape sequence.
    words = re.split(r"\s+", s.strip())
    return " ".join(reversed(words))
print(reverse_new("I like beijing."))
print(reverse_new('Simple is better than complex.'))


import re
# Replace only the first two words; count is passed by keyword because
# passing it positionally is deprecated in recent Python versions.
a = re.sub(r'\w+', '10', "ji 43 af,geq", count=2, flags=re.I)
# A compiled pattern without a count replaces every word.
exp = re.compile(r'\w+', re.I)
b = exp.sub('10', "ji 43 af,geq")
print(a)
print(b)



import re
# Without a flag the match is case-sensitive and must start at position 0.
print(re.match('com', 'Conwww.runcomoob'))
# Case-insensitive matching, still anchored at the start of the string.
print(re.match('com', 'Comwww.runcomoob', flags=re.IGNORECASE))
print(re.compile(r'\w+com\w*', re.IGNORECASE).match('Comwww.runcomoob'))
# search scans the whole string; findall collects every occurrence.
print(re.compile(r'\w+com\w*', re.IGNORECASE).search('Comwww.runcomoob'))
print(re.findall('com', 'Comwww.runcomoob', flags=re.IGNORECASE))


本文地址:https://blog.csdn.net/beginner_liupey/article/details/109959940