��/��/��ţ

��

python��֪ʶ��4-��ý��re��

��Դ�� - Python��С��

��е��٣��֡�

��ů��꺮�ġ�

��Լ��οͣ��κ��

��Ψ��ѭ��Ѱ��

ͽ����ľ��

��̽��ʹ��һ��̽��һ��Ľ��⣬��re��⣬̽��ʲô��﷨�Լ��ȼ��з��

Part 1 - re��ʲô��

re��ʽ��ʽ��ļ�ƣ��ֱ��Ϊregex/regexp��regular expression�ļ�ơ�re�Ǽ��ѧ�е�һ���Ƕ��ַ��/�ı��в��һ��߼��ʽ��ȶ��õ��ض��ַ��Լ��Щ�ַ��ض��ϣ��ϳ�һ��ַ��ͨ��ַ��ַ��/�ı��Ĺ��߼��Ӷ��õ��Ҫ��ݽ��

re��һ��ͨ�õĹ��Ϊ��ʹ�÷��㣬python��reģ�飬��ǿ��ͨ��import re�﷨��python�н��е��á�

Part 2 - ��﷨

̽��﷨��Ϊ��Ҫ��֣��ַ��﷨��﷨��ַ��﷨��ض��ַ��ţ��﷨��ƥ��ַ��ŵ��

2.1 �ַ��﷨

2.2 ��﷨

2.3 ��ʾʽ

Part 3 - ��

3.1 re.match()

re.match()Ϊ��ƥ�亯��re.match(pattern, string, flags=0)��patternΪƥ��stringΪԭʼ�ı��flagsΪ��ѡ��Ϊƥ��ģʽ��re.Sģʽ��.(��)��й��򣬼��з��ڵ��ַ��

re.match()��ܴ�string��ĸ��ƥ�䣬��ܴ��м�ƥ�䣬ͬʱ��ܷ��ƥ��ĵ�һ��ƥ��һ��ԡ�

import re

contents = 'hello, my phone number is 123456789.'

# re.match() only matches at the very start of the string.

# Wrong approach: the string does not begin with digits, so re.match()
# returns None. Guard the result instead of calling .group() on None,
# which would raise AttributeError and stop the script here.
result = re.match(r'(\d+)', contents)
print(result.group(1) if result else None)

# Correct approach: the pattern accounts for the text before the digits.
result = re.match(r'hello.*?(\d+)', contents)
print(result.group(1))

3.2 re.search()

re.search()Ϊ��ƥ�亯��re.search(pattern, string, flags=0)��patternΪƥ��stringΪԭʼ�ı��flagsΪ��ѡ��Ϊƥ��ģʽ��re.Sģʽ��.(��)��й��򣬼��з��ڵ��ַ��

re.search()�ֲ��match��ܴ��ĸ��ƥ��ȱ�ݣ�search��Դ��ı��λ�ý��в�ѯƥ�䣬��ͬ��ܷ��ƥ��ĵ�һ��ƥ��

# re.search() scans the whole string, so the match does not have to
# begin at the first character.
result = re.search(r'(\d+)', contents)
print(result.group(1))

ע�⣺re.match()��re.search()��ֻ�ܷ��ص�һ��ƥ��Ľ��ƥ��з��ݣ��˾��һ��ԣ��Ҫʵ�ֶ�ƥ�䣬��ʱ��Ҫʹ��findall��

3.3 re.findall()

re.findall()Ϊ��ƥ�亯��re.findall(pattern, string, flags=0)��ڷ��з��ƥ��Ľ��patternΪƥ��stringΪԭʼ�ı��flagsΪƥ��ģʽ��re.Sģʽ��.(��)��й��򣬼��з��ڵ��ַ��

re.findall��ֲ��match/search��ȱ�ݣ��Է��ƥ��н��

# re.findall() returns every non-overlapping match, not just the first.
contents = 'hello 001, I am 002'
result = re.findall(r'(\d+)', contents)
print(result)
# Prints a list: ['001', '002']

3.4 re.sub()

re.sub��滻��re.sub(pattern, repl, string)��patternΪ��滻�ַ��Ĳ��ҹ��replΪ�滻�ַ��stringΪԭʼ�ı��

# re.sub()

contents ='hello 001, I am ��С��'

result = re.sub('(\d+)','��̽��',contents)

print(result)

# ��صĽ��Ϊ'hello ��̽��, I am ��С��'

3.5 re.compile()

re.compile��ı��뷽��re.compile(pattern, flags=0)��ƥ��Ϊƥ��󣬷��ʱ��ε��ã��patternΪƥ��flagsΪƥ��ģʽ��Ϊ��ѡ��

# re.compile() builds a pattern object that can be reused across calls.
contents = 'hello 001, I am 002'
patterns = re.compile(r'\d+')
result = patterns.findall(contents)
print(result)
# Also prints a list: ['001', '002']

3.6 ƥ��-re.S

# Matching across line breaks with re.S

# Wrong approach: without re.S, "." does not match a newline, so the
# search returns None. Guard the result instead of calling .group() on
# None, which would raise AttributeError and stop the script here.
contents = '''hello my phone number is 1234
5678, please call me'''
result = re.search('is (.*?),', contents)
print(result.group(1) if result else None)

# Correct approach: re.S makes "." match newlines as well.
result = re.search('is (.*?),', contents, re.S)
print(result.group(1))
# Matches '1234\n5678'

Part 4 - С��

# С��-è��ŵ�Ӱ

import requests

import re

# Page to fetch, and the browser-like User-Agent header sent with the request.
url = 'http://maoyan.com/board'

headers = {
    'User-Agent': ' '.join([
        'Mozilla/5.0 (Windows NT 6.3; Win64; x64)',
        'AppleWebKit/537.36 (KHTML, like Gecko)',
        'Chrome/66.0.3359.181',
        'Safari/537.36',
    ]),
}

# ��ҳ��󣬻�ȡ��ҳ��

def get_html(url):
    """Fetch ``url`` and return the response body as text.

    The request is sent with the module-level ``headers`` and the
    response is decoded as UTF-8.

    The parameter was previously misspelled ``ulr``, which made the body
    ignore its argument and read the global ``url`` instead.
    """
    response = requests.get(url, headers=headers)
    response.encoding = 'utf-8'
    return response.text

# ��̣��ȡ��Ҫ��

defparse_html(content):

patterns = re.compile('class="board-index.*?>(.*?)'+

'.*?

.*?data-val.*?>(.*?)'+

'.*?

(.*?)

.*?

'"releasetime">(.*?)

.*?class="integer">'+

'(.*?).*?fraction">(.*?)',re.S)

result = re.findall(patterns,content)

foriteminresult:

yield{

'rank': item[],

'name': item[1],

'actor': item[2].strip().split('��ݣ�')[1],

'releasetime': item[3].strip().split('��')[1],

'score': item[4] + item[5]

}

if __name__ == '__main__':
    # Fetch the page, then print each record yielded by the parser.
    html = get_html(url)
    result = parse_html(html)
    for item in result:
        print(item)

��Եõ��ȡ��ݵĽ��ʾ��

��Ͼ��ǳ��ý��re��ʹ�÷��̽��ϣ��ܹ��㣡��κ��⣬��ӭ��̽��ϵ��̽��-->ѧϰ��п��Լ�̽��΢�ţ��߽��Ի��

ãã�˺�� ڴ�� ʱ��

��: 2018-05-202018-05-20 18:00:44
ԭ��https://kuaibao.qq.com/s/20180520G10BOJ00?refer=cp_1026
��Ѷ��Ѷ�ƿ��Ѷ��ݿ��ƽ̨�ʺţ��ţ��֮һ��Ѷ��ݿ��ƽ̨��Э�顷ת�ط��ݡ�
��Ȩ��ϵ cloudcommunity@tencent.com ɾ��

��Ѷ

ɨ��

��վ�� Ⱥ

��ȡר�� 10Ԫ��ż�ȯ

˽�� ��ɻ�

python��֪ʶ��4-��ý��re��

��Ѷ

ɨ��

��

�

��Դ

��

��Ѷ�ƿ��

��Ų�Ʒ

��Ƽ�

��Ƽ�

python����֪ʶ��4-���ý�����re����

�����Ѷ

����

�

��Դ

����

��Ѷ�ƿ�����

���Ų�Ʒ

�����Ƽ�

�����Ƽ�

python��֪ʶ��4-��ý��re��

��Ѷ

��

��

��Ѷ�ƿ��

��Ų�Ʒ

��Ƽ�

��Ƽ�