这些是我用过最牛逼的Python脚本！你用过哪些？

2014-04-01更新：

许多人问我能否写一个脚本，一次找出一个文件夹中所有电影的详细信息，因为每次只能查询一部电影的详细信息非常麻烦。我已经更新了这个脚本，支持处理整个文件夹。脚本会分析这个文件夹里的所有子文件夹，从 IMDb 上抓取所有电影的详细信息，然后打开一个电子表格，根据 IMDb 上的排名，从高到低降序排列所有的电影。这个表格中包含了（所有电影的）IMDb URL、年份、情节、分类、获奖信息、演员信息，以及其他你可能在 IMDb 上找到的信息。下面是脚本执行后生成的表格范例：

我个人超级喜欢 Matthew Inman 的漫画。它们在疯狂搞笑的同时，却又发人深省。但是，我很厌烦重复点击“下一个”才能阅读每一篇漫画。另外，由于每一篇漫画都由多幅图片组成，所以手动下载这些漫画是非常困难的。

基于如上原因，我写了一个 Python 脚本，用来从这个站点下载所有的漫画。这个脚本利用 BeautifulSoup (http://www.crummy.com/software/BeautifulSoup/ ) 解析 HTML 数据，所以在运行脚本前，必须安装 BeautifulSoup。用于下载 The Oatmeal（“燕麦片”，Matthew Inman 的漫画网站）的下载器已经上传到 GitHub：theoatmeal.com-downloader。（漫画）下载完后的文件夹是这样的 :D

Akshit Khurana，4400+ 顶

感谢 500 多个朋友在 Facebook 上为我送出的生日祝福

有三个故事让我的 21 岁生日变得难忘，这是最后一个故事。我倾向于在每一条祝福下亲自评论，但是使用 Python 来做更好。

…

# Thanking everyone who wished me on my birthday
import requests
import json

# Aman's post time (Unix timestamp); only posts created after it are fetched.
AFTER = 1353233754
# Graph API access token. The value is blank in the published listing and
# must be filled in before the script can authenticate.
TOKEN = ' '

def get_posts():
    """Fetch the posts other people left on my wall after ``AFTER``.

    Sends one FQL query (at most 200 rows) to the Graph API ``fql``
    endpoint, authenticated with ``TOKEN``.

    Returns:
        The ``data`` member of the decoded JSON response. Each row is
        expected to carry ``post_id``, ``actor_id`` and ``message``, the
        columns named in the query.

    Raises:
        KeyError: if the response has no ``data`` member
            (presumably an API error payload -- verify against the API).
    """
    # Build the cut-off from AFTER instead of repeating the literal, so the
    # constant and the query cannot drift apart.
    query = ("SELECT post_id, actor_id, message FROM stream WHERE "
             "filter_key = 'others' AND source_id = me() AND "
             "created_time > %d LIMIT 200" % AFTER)

    payload = {'q': query, 'access_token': TOKEN}
    r = requests.get('https://graph.facebook.com/fql', params=payload)
    result = json.loads(r.text)
    return result['data']

def commentall(wallposts):
    """Comment a personalised thank-you on every post in *wallposts*.

    For each post, the author's profile is looked up to get the first name,
    then ``Thanks <first name> :)`` is posted as a comment on that post.

    Args:
        wallposts: iterable of mappings with ``actor_id`` and ``post_id``
            keys, as returned by ``get_posts``.

    Raises:
        KeyError: if a profile response has no ``first_name`` member.
    """
    # TODO convert to batch request later
    for wallpost in wallposts:
        r = requests.get('https://graph.facebook.com/%s' %
                         wallpost['actor_id'])
        url = 'https://graph.facebook.com/%s/comments' % wallpost['post_id']
        user = json.loads(r.text)
        message = 'Thanks %s :)' % user['first_name']
        payload = {'access_token': TOKEN, 'message': message}
        # The response is not inspected; the original kept it in an unused
        # variable.
        requests.post(url, data=payload)

        # Single-argument print() behaves the same on Python 2 and 3.
        print("Wall post %s done" % wallpost['post_id'])

# Script entry point: fetch the wall posts and thank each author.
if __name__ == '__main__':
    commentall(get_posts())

…

整理照片

当我对图像处理感兴趣之后，我一直致力于研究机器学习。我写这个有趣的脚本，目的是为了分类图片，很像 Facebook 做的那样（当然这是一个不够精确的算法）。我使用了 OpenCV 的人脸检测算法，“haarcascade_frontalface_default.xml”，它可以从一张照片中检测到人脸。

你可能已经察觉到这张照片的某些地方被错误地识别为人脸。我试图通过修改一些参数（来修正这一问题），但还是某些地方被错误地识别为人脸，这是由相机的相对距离导致的。我会在下一阶段解决这一问题（训练步骤）。

我可以增加一个名字，像这个样子：

当训练了几个素材后，它会像这个样子：

最后一个是为应对那些垃圾随机方块而使用的变通解决方案。

带名字的最终文件夹。

所以，现在寻找图片变得相当简单。顺便提一下，很抱歉（我）放大了这些照片。

…

import cv2
import sys
import os
import random
import string
#choices=['Add a name']
from Tkinter import Tk
from easygui import *
import numpy as np

# Folder that holds this script; person folders and photos live next to it.
current_directory = os.path.dirname(os.path.abspath(__file__))

# Split the directory listing: entries without a '.' are treated as person
# folders, everything else (files with an extension) goes to `testing`.
x = os.listdir(current_directory)
new_x = []
testing = []
for i in x:
    if '.' not in i:
        new_x += [i]
    else:
        testing += [i]
x = new_x
# g aliases the same list of person names; labels are looked up through it.
g = x
# Button captions offered when asking who a face belongs to.
choices = ['Add a name'] + x
# Numeric labels 1..len(x), parallel to the names in x / g.
y = range(1, len(x) + 1)

def get_images_and_labels():
    """Load every training image and its numeric label.

    Walks each person folder named in ``g`` (under ``current_directory``),
    reads every file as a grayscale image and derives the label from the
    file name: the text before the first ``_`` is looked up in ``g`` and
    mapped to the parallel entry in ``y``.

    Returns:
        ``(images, labels)`` -- two parallel lists -- or ``(False, False)``
        when there are no person folders at all.

    Raises:
        ValueError: if a file name has no ``_`` or its prefix is not a
            name listed in ``g``.
    """
    global current_directory, x, y, g
    # Test g (the list actually iterated) rather than x: both start as the
    # same list, but x can be rebound elsewhere in the script.
    if not g:
        return (False, False)

    image_paths = []
    for person in g:
        folder = os.path.join(current_directory, person)
        for filename in os.listdir(folder):
            image_paths.append(os.path.join(folder, filename))

    # images will contain the face images
    images = []
    # labels will contain the label assigned to each image
    labels = []
    for image_path in image_paths:
        # Read the image directly as grayscale (flag 0).
        img = cv2.imread(image_path, 0)
        if img is None:
            # cv2.imread yields None for files it cannot decode; skip them
            # instead of failing inside np.array.
            continue
        # Convert the image format into a numpy array
        image = np.array(img, 'uint8')
        # The label name is the part of the file name before the first '_'.
        filename = os.path.basename(image_path)
        person = filename[:filename.index('_')]
        nbr = y[g.index(person)]
        images.append(image)
        labels.append(nbr)
    # return the images list and labels list
    return images, labels

# Perform the training
def train_recognizer():
    """Train an LBPH face recognizer on the collected face images.

    Returns:
        The trained recognizer, or ``False`` when there is nothing to
        train on (no person folders, or no readable images).
    """
    images, labels = get_images_and_labels()
    # `not images` covers both the (False, False) sentinel and an empty
    # list; the original `images == False` let an empty list through to
    # train().
    if not images:
        return False
    recognizer = cv2.createLBPHFaceRecognizer()
    cv2.destroyAllWindows()
    recognizer.train(images, np.array(labels))
    return recognizer

def _save_face(face, person):
    """Write *face* as a randomly named .jpg into *person*'s folder.

    The folder is created under ``current_directory`` if needed.
    Returns the path of the written file.
    """
    directory = os.path.join(current_directory, person)
    if not os.path.exists(directory):
        os.makedirs(directory)
    random_name = ''.join(random.choice(string.ascii_uppercase + string.digits)
                          for _ in range(7))
    path = os.path.join(directory, random_name + '.jpg')
    cv2.imwrite(path, face)
    return path


def get_name(image_path, recognizer):
    """Detect faces in one photo, confirm each name and file the crop.

    Each detected face is run through *recognizer*. Predictions with a
    confidence value of 140 or more are skipped. Otherwise the user is
    asked to confirm the predicted name; on "No" the user picks a name
    from ``choices`` or adds a new one. The face crop is saved into the
    folder of the chosen name.

    Args:
        image_path: path of the photo to process.
        recognizer: trained recognizer exposing ``predict``.
    """
    global choices, g
    cascadePath = "haarcascade_frontalface_default.xml"
    faceCascade = cv2.CascadeClassifier(cascadePath)
    print(image_path)
    image = cv2.imread(image_path)
    img = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    predict_image = np.array(img, 'uint8')
    faces = faceCascade.detectMultiScale(
        img,
        scaleFactor=1.3,
        minNeighbors=5,
        minSize=(30, 30),
        flags=cv2.cv.CV_HAAR_SCALE_IMAGE
    )
    # Loop names differ from the module-level x / y on purpose: the
    # original declared `global x` and then overwrote that list with a
    # pixel coordinate.
    for (fx, fy, fw, fh) in faces:
        # Rows span the height and columns the width (the original crop
        # had w and h swapped).
        f = image[fy:fy + fh, fx:fx + fw]
        cv2.imwrite('temp.jpg', f)
        im = 'temp.jpg'
        nbr_predicted, conf = recognizer.predict(
            predict_image[fy:fy + fh, fx:fx + fw])
        # Labels start at 1, list indices at 0.
        predicted_name = g[nbr_predicted - 1]
        print("{} is Correctly Recognized with confidence {}".format(predicted_name, conf))
        if conf >= 140:
            continue
        msg = 'Is this ' + predicted_name
        reply = buttonbox(msg, image=im, choices=['Yes', 'No'])
        if reply == 'Yes':
            _save_face(f, predicted_name)
        else:
            msg = "Who is this?"
            reply = buttonbox(msg, image=im, choices=choices)
            if reply == 'Add a name':
                name = enterbox(msg='Enter the name', title='Training', strip=True)
                print(name)
                if name:
                    choices += [name]
                reply = name
            if not reply:
                # Dialog closed or cancelled: no name to file this face
                # under, so leave it unsaved.
                continue
            print(_save_face(f, reply))

# Work out where dialogs should appear: half the screen width across and a
# fifth of the screen height down.
root = Tk()
pos = (
    int(root.winfo_screenwidth() * 0.5),
    int(root.winfo_screenheight() * 0.2),
)
# Hide the helper Tk window; it was only needed to read the screen size.
root.withdraw()
WindowPosition = "+%d+%d" % pos

# Override the name rootWindowPosition with the computed geometry string.
# NOTE(review): this rebinds the name in this module only; whether easygui
# picks the value up depends on how it reads the variable -- confirm.
rootWindowPosition = WindowPosition

def detect_faces(img):
    """Detect faces in a photo and ask the user to name each one.

    Every detected face is cropped and shown in a dialog. The user picks a
    name from ``choices`` or adds a new one, and the crop is saved as a
    randomly named .jpg in that person's folder under ``current_directory``.

    Args:
        img: path of the photo to process.
    """
    global choices, current_directory
    imagePath = img
    faceCascade = cv2.CascadeClassifier(cascPath)
    image = cv2.imread(imagePath)
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    faces = faceCascade.detectMultiScale(
        gray,
        scaleFactor=1.3,
        minNeighbors=5,
        minSize=(30, 30),
        flags=cv2.cv.CV_HAAR_SCALE_IMAGE
    )
    print("Found {0} faces!".format(len(faces)))
    for (x, y, w, h) in faces:
        # Rows span the height and columns the width (the original crop
        # had w and h swapped).
        f = image[y:y + h, x:x + w]
        cv2.imwrite('temp.jpg', f)
        im = 'temp.jpg'
        msg = "Who is this?"
        reply = buttonbox(msg, image=im, choices=choices)
        if reply == 'Add a name':
            name = enterbox(msg='Enter the name', title='Training', strip=True)
            print(name)
            if name:
                choices += [name]
            reply = name
        if not reply:
            # Dialog closed or cancelled: nothing to file this face under.
            continue
        directory = os.path.join(current_directory, reply)
        if not os.path.exists(directory):
            os.makedirs(directory)
        random_name = ''.join(random.choice(string.ascii_uppercase + string.digits)
                              for _ in range(7))
        path = os.path.join(directory, random_name + '.jpg')
        print(path)
        cv2.imwrite(path, f)

def new(img, recognizer):
    """Resolve *img* against ``current_directory`` and run ``get_name`` on it.

    Args:
        img: file name of the photo, relative to ``current_directory``.
        recognizer: trained recognizer passed through to ``get_name``.
    """
    imagePath = os.path.join(current_directory, img)
    print(imagePath)
    get_name(imagePath, recognizer)

# Haar cascade file used by detect_faces.
cascPath = 'haarcascade_frontalface_default.xml'
# Number of directory entries visited so far.
b = 0
os.system("change_name.py")
for filename in os.listdir("."):
    b += 1
    # Run the rename helper and retrain on the first entry and on every
    # tenth one.
    if b % 10 == 0 or b == 1:
        os.system("change_name.py")
        recognizer = train_recognizer()
    if filename.endswith(('.jpg', '.png')):
        print(filename)
        imagePath = filename
        if recognizer is False:
            # train_recognizer() returns False when there is no training
            # data; fall back to manual labelling rather than calling
            # .predict on False.
            detect_faces(imagePath)
        else:
            new(imagePath, recognizer)
        # The photo is deleted once its faces have been filed.
        os.remove(filename)
        raw_input('Done with this photograph')

…

PNR（Passenger Name Record旅客订座记录，下同）状态短讯

铁路方面不经常发送 PNR 状态消息。因此，我写了一个脚本，可以从印度铁路网站获取 PNR 状态。这是非常容易的，因为那个网站没有验证码，即使有，也只是形同虚设的验证码（在过去，一些字母会被写在看起来像图片一样的东西上面，因为他们为这些字母使用了一个 “check” 的背景图）。我们可以轻松地从 HTML 网页得到这些字母。我不明白他们这样做的目的是什么，难道仅仅是为了愚弄他们自己吗？不管怎么样，我使用短信息脚本来处理它，经过一段时间间隔，它会在我的笔记本上运行一次，就像是一个定时任务，只要 PNR 状态有更新，它就会把更新信息发送给我。

Nalanda 下载器

我们一般在这个叫 ‘Nalanda’ 的网站上下载教学课件以及其他课程资料，‘Nalanda’ 是 BITS Pilani 的课程网站。我自己懒得在考试前一天下载所有的课件，所以，我写了这个下载器，它可以把每一门课的课件下载到相应的文件夹。

代码：

…

import mechanize
import os
import urllib2
import urllib
import requests
import getpass
import time
from bs4 import BeautifulSoup

br = mechanize.Browser()
br.open('https://nalanda.bits-pilani.ac.in/login/index.php')

# Keep asking for credentials until the returned page shows a user name.
name = ''
while name == '':
    try:
        print('*******')
        username = raw_input('Enter Your Nalanda Username: ')
        password = getpass.getpass('Password: ')
        # Select the form on every attempt: submit() loads a new page, so a
        # selection made once before the loop is gone on the second try.
        # NOTE(review): assumes a failed login lands on a page whose first
        # form is again the login form -- confirm.
        br.select_form(nr=0)
        br.form['username'] = username
        br.form['password'] = password
        res = br.submit()
        response = res.read()
        soup = BeautifulSoup(response)
        # find() yields None when there is no 'logininfo' block, so the
        # attribute access raises and the loop retries. The last two
        # characters of the link text are dropped.
        name = str(soup.find('div', attrs={'class': 'logininfo'}).a.string)[:-2]
    except Exception:
        # `except Exception` (not a bare except) lets Ctrl-C end the loop.
        print('Wrong Password')

# Start timing after login so typing the credentials is not counted as
# download time.
start_time = time.time()

# NOTE(review): this stores the password in plain text in details.txt.
# Kept because another tool may read the file; consider removing it.
with open('details.txt', 'w') as f:
    f.write(username + '\n' + password)
print('Welcome, ' + name)

print('All the files will be downloaded in your Drive C in a folder named "nalanda"')

# Collect (course name, course URL) pairs from the page returned after
# login and create one folder per course under c:\nalanda\.
div = soup.find_all('div', attrs={'class': 'box coursebox'})
l = len(div)
a = []
for d in div:
    s = str(d.div.h2.a.string)
    # Keep the text before the first '('. partition() returns the whole
    # title when there is no '(', whereas s[:s.find('(')] dropped the last
    # character in that case.
    s = s.partition('(')[0]
    c = (s, str(d.div.h2.a['href']))
    path = 'c:\\nalanda\\' + c[0]
    if not os.path.exists(path):
        os.makedirs(path)
    a += [c]

# Visit every course page and download the resources of each section.
# overall collects, per course, the indices of the sections that held
# resources.
overall = []
for i in range(l):
    response = br.open(a[i][1])
    page = response.read()
    soup = BeautifulSoup(page)
    li = soup.find_all('li', attrs={'class': 'section main clearfix'})
    t = []
    folder = a[i][0]
    print('Downloading ' + folder + ' files...')
    o = []
    for j, section in enumerate(li):
        g = section.ul
        if g is None or g.li is None:
            continue
        # bs4 returns multi-valued attributes such as class as a list;
        # accept a plain space-separated string as well.
        temp = g.li['class']
        if not isinstance(temp, list):
            temp = temp.split(' ')
        # Only sections whose first item has 'resource' as its second
        # class are downloaded.
        if len(temp) < 2 or temp[1] != 'resource':
            continue
        o += [j]
        h = section.find('div', attrs={'class': 'content'})
        s = str(h.h3.string)
        path = 'c:\\nalanda\\' + folder
        # Course names can end in a space (text cut before '('); drop it
        # before appending the section name.
        if path[-1] == ' ':
            path = path[:-1]
        path += '\\' + s
        if not os.path.exists(path):
            os.makedirs(path)
        z = []
        for item in g.find_all('li'):
            p = item.div.div.a
            q = item.find('span', attrs={'class': 'resourcelinkdetails'}).contents
            link = str(p['href'])
            text = str(p.find('span').contents[0])
            # Guess the file type from the resource details text.
            details = str(q[0])
            if 'word' in details:
                typ = '.docx'
            elif 'JPEG' in details:
                typ = '.jpg'
            else:
                typ = '.pdf'
            if typ != '.docx':
                # Images and PDFs sit on a viewer page; open it and pull
                # the real file URL out of the embedded element.
                res = br.open(link)
                resource_page = BeautifulSoup(res.read())
                if typ == '.jpg':
                    di = resource_page.find('div', attrs={'class': 'resourcecontent resourceimg'})
                    link = di.img['src']
                else:
                    di = resource_page.find('div', attrs={'class': 'resourcecontent resourcepdf'})
                    link = di.object['data']
            target = path + '\\' + text + typ
            try:
                # Skip files that already exist from an earlier run.
                if not os.path.exists(target):
                    br.retrieve(link, target)
            except Exception:
                print('Connectivity Issues')
            z += [(link, text, typ)]
        t += [(s, z)]
    if t == []:
        print('No Documents in this subject')
    overall += [o]

# Report the time elapsed since start_time, then keep the console window
# open for ten seconds before the script exits.
elapsed_seconds = time.time() - start_time
print('Time Taken to Download: ' + str(elapsed_seconds) + ' seconds')
print('Do you think you can download all files faster than this :P')
print('Closing in 10 seconds')
time.sleep(10)

…

这个是主页面:

这个页面显示了所有的用户和他们的链接。因为我给 Nick 加了一个超链接，所以链接这一栏是空的。

所以，当用户数量增加以后，这个页面会列出所有的用户列表。基本上，这个页面充当了一个你和另外一个人联系的中间人角色。我还做了一个在所有用户中搜索特定文件的功能。

这里是客户端的 python 文件（这是一段很长的代码，我上传到了 Ideone）

所有这些代码仅仅用于教育目的。

欢迎关注我的博客或者公众号有好礼相送：https://home.cnblogs.com/u/Python1234/ Python学习交流

欢迎加入我的学习交流答疑群：125240963

这些是我用过最牛逼的Python脚本！你用过哪些？

猜你喜欢