Python分析智联&前程无忧Email中应聘人员信息

这个脚本是帮我们公司的人力在筛选简历时写的。当时正值校招季节,被拜托实现从Email中提取应聘人员的职位、姓名、电话、邮箱。一开始准备在foxmail等软件的基础上进行实现,后来发现此方法行不通,于是转变了一下思路:直接用Python把邮件收回来,然后用正则匹配、写入文件就可以了,所以就有了以下的脚本。

其中,前程无忧的Email是经过Base64编码的,所以需要先解码,然后再进行正则匹配。

脚本是帮人力写的,他们也不懂Python,所以最后用py2exe给打了一个exe的包,就让他们看看了,本来想有空再加上一些时间的判断等机制,后来也没再找我,就这样了,估计也就是用了一次…

配置文件

1
2
3
4
[info]
pop3 = xxxx.xxxx.com
email = xxxx@xxxx.com
password = xxxxxxxx

getMail.py

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
#-*- encoding: gbk -*-
#!/usr/bin/python
import base64
import poplib
import re
import email
import ConfigParser
import sys
import os
reload(sys)
sys.setdefaultencoding('utf-8')

# File that receives one CSV line per recognised application mail.
# It is opened in append mode, so repeated runs keep adding lines.
OUTPUT_CSV = 'Finnallist.csv'

# Sender detection: both patterns are applied to the message with all of its
# lines concatenated WITHOUT separators, so '.' can span the whole message.
ZHAOPIN_SENDER_RE = re.compile(r'^.*?Received: from(.*?).zhaopinmail.com.*?$')
JOB51_SENDER_RE = re.compile(r'^.*?Received: from(.*?).51job.com.*?$')

# The base64 payload of a 51job mail: everything between the
# Content-Transfer-Encoding header and the next "--=_" boundary marker.
JOB51_BODY_RE = re.compile(
    r'^.*?Content-Transfer-Encoding: base64(.*?)--=_.*?$')

# Mobile number: 1 + two-digit prefix + 4 digits + 4 digits, with optional
# non-digit separators between the groups.  The leading greedy '.*' makes the
# LAST candidate in the text win.  The character classes used to contain
# literal commas ("4[5,7]"), which also accepted "14," etc. as a prefix.
PHONE_RE = re.compile(
    r'^.*((1)(3\d|4[57]|5[0-35-9]|8[0236-9])\D*(\d{4})\D*(\d{4})).*$')

# First "mailto:" target; the two sites close the attribute with different
# quote characters.
ZHAOPIN_MAILTO_RE = re.compile(r"^.*?mailto:(.*?)'.*?$")
JOB51_MAILTO_RE = re.compile(r'^.*?mailto:(.*?)".*?$')


def first_group(pattern, text):
    """Return capture group 1 of ``pattern`` matched against ``text``.

    Returns the literal string "None" when the pattern does not match, which
    is the placeholder written to the CSV for a missing phone/e-mail value.
    """
    found = pattern.match(text)
    if found is None:
        return "None"
    return found.group(1)


def zhaopin_record(allpiece, nonallpiece):
    """Build the CSV line for a Zhaopin mail.

    ``allpiece`` is the message with a newline after every line,
    ``nonallpiece`` the same lines joined without separators.

    The decoded subject is cut after the marker "应聘" and split on "-";
    the first four pieces become the leading columns.  When only three
    pieces exist, the second column is left empty.

    Raises IndexError when the subject does not have the expected shape.
    """
    subject = allpiece.split("Subject: ")[1].split("Date: ")[0]
    decoded = email.Header.decode_header(subject)[0][0]
    fields = decoded.split("应聘")[1].split("-")

    if len(fields) >= 4:
        row = ",".join(fields[:4]) + ","
    else:
        # Three-piece subject: keep the column count by emitting an empty
        # second column (IndexError here means the subject is unusable).
        row = fields[0] + ",," + fields[1] + "," + fields[2] + ","

    row += first_group(PHONE_RE, nonallpiece) + ","
    row += first_group(ZHAOPIN_MAILTO_RE, nonallpiece) + "\n"
    return row


def job51_record(allpiece, nonallpiece):
    """Build the CSV line for a 51job mail.

    ``allpiece`` / ``nonallpiece`` are as for ``zhaopin_record``.  The body
    is base64-decoded and stripped of all whitespace before the phone number
    and e-mail address are searched in it.

    Raises ValueError when the message has no base64 part (this used to
    crash the whole run with an AttributeError), IndexError when the subject
    header cannot be located, and whatever ``base64.decodestring`` raises on
    a corrupt payload.
    """
    payload = JOB51_BODY_RE.match(nonallpiece)
    if payload is None:
        raise ValueError("51job message without a base64 body")
    body = "".join(base64.decodestring(payload.group(1)).split())

    subject = allpiece.split("subject: ")[1].split("MIME-Version:")[0]
    row = email.Header.decode_header(subject)[0][0] + ", "
    row += first_group(PHONE_RE, body) + ","
    row += first_group(JOB51_MAILTO_RE, body) + "\n"
    return row


def append_record(record):
    """Append one finished CSV line to OUTPUT_CSV."""
    with open(OUTPUT_CSV, 'a') as out:
        out.write(record)


def main():
    """Fetch every message of the configured POP3 mailbox and export applicants.

    Connection settings come from the [info] section of conf.ini (keys
    pop3, email, password).  Each message recognised as coming from Zhaopin
    or 51job is turned into one CSV line, appended to OUTPUT_CSV and echoed
    to stdout.  A message that cannot be parsed prints "ERROR! <reason>" and
    is skipped; other messages are ignored silently.
    """
    config = ConfigParser.ConfigParser()
    config.read("conf.ini")
    server = config.get('info', 'pop3')
    address = config.get('info', 'email')
    password = config.get('info', 'password')

    mailbox = poplib.POP3(server)
    try:
        mailbox.user(address)
        mailbox.pass_(password)

        # stat() returns (message count, mailbox size); only the count is used.
        message_count = mailbox.stat()[0]

        # POP3 message numbers start at 1.
        for number in range(1, message_count + 1):
            lines = mailbox.retr(number)[1]
            allpiece = "".join(line + "\n" for line in lines)
            nonallpiece = "".join(lines)

            if ZHAOPIN_SENDER_RE.match(nonallpiece) is not None:
                build_record = zhaopin_record
            elif JOB51_SENDER_RE.match(nonallpiece) is not None:
                build_record = job51_record
            else:
                continue

            try:
                record = build_record(allpiece, nonallpiece)
                append_record(record)
                print(record)
            except Exception as e:
                # Best effort: one malformed mail must not stop the export.
                print("ERROR! %s" % e)
    finally:
        # Always end the POP3 session, even if a command above failed.
        mailbox.quit()


if __name__ == "__main__":
    main()

py2exe打包

1
2
3
4
5
# Build script that packages getMail.py as a console program with py2exe.
from distutils.core import setup
# Imported for its side effect only; the name is not referenced below.
import py2exe

setup(
    console=["getMail.py"],
    options={
        "py2exe": {
            # Modules listed explicitly for inclusion in the build.
            "includes": ["email", "poplib"],
            # NOTE(review): per the py2exe docs, level 1 bundles everything
            # into the executable - confirm against the py2exe version used.
            "bundle_files": 1,
        },
    },
)