一头乱码's OffIcE

2009年3月11日星期三

BBS_Post_Surveillant_v1.0——用来监测新主题贴和新回复,我听说标题长看得人会多

思路:分析源代码，用正则匹配所需要的信息进行判断
分析图:

# BBS_Post_Surveillant1.0.py

import urllib2,re,threading

from time import ctime,sleep

# Module-level state shared by RegexMatch and Threadworking.
# first_loop is True until the first scrape has been stored as the baseline;
# new_post is the "no matching old title found" flag used while comparing.
first_loop=True
new_post=True

# Results of the most recent scrape: thread titles, last-post dates and
# last-poster IDs.
regex_match_title,regex_match_date,regex_match_lastpost=[],[],[]

# The same three lists as they were after the previous scrape.
regex_match_title_old,regex_match_date_old,regex_match_lastpost_old=[],[],[]

# Thread target: extracts one kind of field (title, last-post date or
# last-poster ID) from the forum page. The three jobs differ only in their
# arguments, so each one runs in its own thread to save time.
def RegexMatch(regmatch,listindex):
    """Download the forum index page and store every match of ``regmatch``.

    regmatch  -- pattern handed to re.findall.
    listindex -- selects the destination list: 0 -> regex_match_title,
                 1 -> regex_match_date, anything else -> regex_match_lastpost.

    Only the selected global list is replaced. Resetting all three lists in
    every call would let concurrently running workers wipe each other's
    results.
    """
    global regex_match_title
    global regex_match_date
    global regex_match_lastpost

    # Fetch the page source; close the response even if read() fails.
    f=urllib2.urlopen('http://bbs.cfan.com.cn/forum-48-1.html')
    try:
        file_text=f.read()
    finally:
        f.close()

    matches=re.findall(regmatch,file_text)
    if listindex==0:
        regex_match_title=matches
    elif listindex==1:
        regex_match_date=matches
    else:
        regex_match_lastpost=matches

            

# The name is poorly chosen: this function is what actually picks out the
# threads that are new or have new replies.
def Threadworking():
    """Scrape the forum page once and report new topics and new replies.

    One RegexMatch thread is started per pattern (title, last-post date,
    last-poster ID) and all of them are joined before the results are read.

    On the first call the scraped values are only stored as the baseline.
    On later calls every scraped thread is looked up by title in the previous
    snapshot: a changed date or last-poster ID is printed as a new reply, a
    title that is not found is printed as a new topic. The snapshot is then
    replaced by the current scrape. ``new_post`` is left True on return.
    """
    global new_post
    global first_loop
    global regex_match_title
    global regex_match_date
    global regex_match_lastpost
    global regex_match_title_old
    global regex_match_date_old
    global regex_match_lastpost_old

    # Raw strings so the regex backslashes are never treated as string escapes.
    regex_match_element=[
        r'\s*<span\s*id="thread_\d*"><a\s*href="thread-\d*-\d*-\d*\.html".*?>(.+?)</a></span>',
        r'<em><a\s*href="redirect\.php\?tid=\d*&goto=lastpost#lastpost">(\d*-\d*-\d*\s* \d*:\d*)</a></em>',
        r'<cite>by <a href="space.php\?action=viewpro&username=.+?">(.+?)</a></cite>',
    ]
    separator='==============================无与伦比的分割线=============================='

    # Create the worker threads, one pattern each, then run and join them.
    threads=[]
    for i in range(len(regex_match_element)):
        t=threading.Thread(target=RegexMatch,args=(regex_match_element[i],i))
        threads.append(t)
    for t in threads:
        t.start()
    for t in threads:
        t.join()

    # Only rows present in all three result lists can be compared; bounding
    # the index avoids an IndexError when one pattern matched fewer times.
    count=min(len(regex_match_title),len(regex_match_date),len(regex_match_lastpost))

    # Decide what changed since the previous scrape.
    if first_loop==True:
        # Nothing to compare against yet; just record the baseline below.
        first_loop=False
    else:
        len_old=range(len(regex_match_title_old))
        for i in range(count):
            for j in len_old:
                if regex_match_title[i]==regex_match_title_old[j]:
                    new_post=False
                    if regex_match_date[i]!=regex_match_date_old[j]:
                        print('《'+regex_match_title[i]+'》'+' 有新回复  '+'   最后发表ID:'+regex_match_lastpost[i]+'   最后发表日期：'+regex_match_date[i])
                        print(separator)
                        break
                    elif regex_match_lastpost[i]!=regex_match_lastpost_old[j]:
                        print('《'+regex_match_title[i]+'》'+' 有新回复'+'   最后发表ID:'+regex_match_lastpost[i]+'   最后发表日期：'+regex_match_date[i])
                        print(separator)
                        break
            if new_post==True:
                print('有新主题贴：'+'《'+regex_match_title[i]+'》'+'   最后发表ID：'+regex_match_lastpost[i]+'   最后发表日期：'+regex_match_date[i])
                print(separator)
            else:
                # Reset the flag for the next scraped thread.
                new_post=True

    # The current scrape becomes the snapshot for the next call.
    regex_match_title_old=regex_match_title[:count]
    regex_match_date_old=regex_match_date[:count]
    regex_match_lastpost_old=regex_match_lastpost[:count]

            

# Main loop: keep scraping the page and handing it to Threadworking for
# analysis, sleeping between two passes.
def main(interval=10):
    """Poll the forum forever.

    interval -- seconds to sleep between two passes (default 10).
    """
    while True:
        Threadworking()
        sleep(interval)

if __name__=='__main__':
    main()

运行结果:

注意：自己的回复也会判断成新回复，需要的话可以把登录的ID也匹配出来判断下

郑重声明：此程序本人原创，只在本人blogger、百度博客、QQ空间和电脑爱好者官方论坛编程版发布，除此之外未经本人授权散布者均为侵权，本人保留诉讼权
授权链接: http://sruing.blogspot.com
http://hi.baidu.com/sruingking
http://bbs.cfan.com.cn/thread-840294-1-1.html

2009年3月2日星期一

多线程获得帖子标题和最后发表时间

练练Py的正则

import urllib2,re,threading

from time import ctime

def RegexMatch(regmatch):

    for reg_temp in re.findall(regmatch,file):

        print reg_temp,ctime()

# Patterns handed to the worker threads, one thread per entry.
regex_match_element=[
    # [0] thread title: the link text inside <span id="thread_N">
    '\s*<span\s*id="thread_\d*">'
    '<a\s*href="thread-\d*-\d*-\d*\.html">(.+?)</a></span>',
    # [1] date and time shown in the "last post" link
    '<em><a\s*href="redirect\.php\?tid=\d*&goto=lastpost#lastpost">'
    '(\d*-\d*-\d*\s* \d*:\d*)</a></em>',
]

def main():
    """Download the forum page and scan it with one thread per pattern.

    The page text is stored in the module-level global ``file`` because
    RegexMatch reads it from there (note that this name shadows the
    Python 2 built-in ``file``). After all threads have finished, a closing
    message is printed.
    """
    global file

    # Close the HTTP response as soon as the body has been read, even if
    # read() raises.
    f=urllib2.urlopen('http://bbs.cfan.com.cn/forum-53-1.html')
    try:
        file=f.read()
    finally:
        f.close()

    threads=[]
    for pattern in regex_match_element:
        threads.append(threading.Thread(target=RegexMatch,args=(pattern,)))
    for t in threads:
        t.start()
    for t in threads:
        t.join()

    print('结束')

if __name__=='__main__':
    main()

2009年3月1日星期日

获取帖子标题(精简版)

import urllib2,re

# Download the forum index page and print the title of every thread on it.
f=urllib2.urlopen('http://bbs.cfan.com.cn/forum-53-1.html')
try:
    page=f.read()
finally:
    # Release the HTTP connection as soon as the body has been read.
    f.close()

# The single capturing group is the link text, i.e. the thread title.
reg1=u'\s*<span\s*id="thread_\d*"><a\s*href="thread-\d*-\d*-\d*\.html">(.+?)</a></span>'

for reg in re.findall(reg1,page):
    print(reg)

print('结束')

2009年2月28日星期六

论坛监视器第一步——获取帖子标题

正则方面感觉和原来的小有区别：貌似零宽断言没有了，零宽断言完成的任务让分组替代了，简单了很多。

写了一个获取当前页面帖子标题的小程序练习一下

import urllib2,re

# Download the forum page, save it to html.txt, then scan the saved file
# line by line and print every thread title found.
f=urllib2.urlopen('http://bbs.cfan.com.cn/forum-53-1.html#stickthread_787399')
try:
    html=f.read()
finally:
    f.close()

# close() must actually be called (with parentheses) so the data is flushed
# to disk before the file is reopened for reading below.
fp=open('html.txt','w')
try:
    fp.write(html)
finally:
    fp.close()

# Group 2 is the link text between the opening <a ...> and </a></span>.
reg1='(\s*<span\s*id="thread_\d*"><a\s*href="thread-\d*-\d*-\d*\.html">)(.*)(</a></span>)'

fp=open('html.txt','r')
try:
    for line in fp:
        found=re.search(reg1,line)
        if found is None:
            # This line holds no thread title.
            continue
        print(found.group(2))
        # Blank line between titles.
        print('')
finally:
    fp.close()

print("分析结束!")

运行后的截图

骨头在选择方法的时候明显地比我老道，我写的太累赘了。

2009年2月16日星期一

VC++代理刷流量——一段优美的托管代码

参考MSDN拼凑出来的代码，没有一行是我写的，但是实现了使用代理访问网页。
稍加修饰就能实现刷流量的功能了，但是我不想再写了，没什么意思。

#include "stdafx.h"



#using <System.dll>

using namespace System;

using namespace System::IO;

using namespace System::Net;

using namespace System::Text;

int _tmain(int argc, _TCHAR* argv[])

{

    WebProxy^ proxyObject = gcnew WebProxy( "http://121.22.29.182:80/",true );

    // Create a request for the URL.   

    WebRequest^ req = WebRequest::Create( "http://hi.baidu.com/sruingking" );

    req->Proxy = proxyObject;

    // Get the response.

    HttpWebResponse^ response = dynamic_cast<HttpWebResponse^>(req->GetResponse());

    response->Close();

    return 0;

}

2009年2月12日星期四

更新下代码

其实就是加了个把发生错误的代理地址写进文件的功能。可以多刷几次，如果几次错误地址都一样，基本就可以确定那个地址不能用了。

# ProxyBat_v1.2.py

import urllib2

# Append the current proxy address to the error log.
def errortype():
    """Record the proxy held in the global ``line`` as failed.

    Before the first failure is logged, a separator line is written to the
    global log file ``f_log``. The global counter ``num_Error`` is then
    incremented and the address is written followed by a newline.
    """
    global line,num_Error
    is_first_failure=(num_Error==0)
    if is_first_failure:
        f_log.write("==========保护视力的分割线==========\n")
    num_Error+=1
    for piece in (line,'\n'):
        f_log.write(piece)

# Number of proxies that failed so far (incremented by errortype()).
num_Error=0

# proxy.txt holds one proxy address per line; failing addresses are appended
# to ErrLog.txt.
f_proxy=open('proxy.txt','r')
f_log=open('ErrLog.txt','a')

try:
    # Visit the target page once through every proxy listed in the file.
    for raw_line in f_proxy:
        # Strip the trailing newline so it does not end up inside the proxy
        # URL or the log; skip blank lines.
        line=raw_line.strip()
        if not line:
            continue

        try:
            # ProxyHandler() takes a dict mapping the scheme to the proxy URL;
            # build_opener() returns an opener whose open() goes through it.
            proxy_handler = urllib2.ProxyHandler({'http':'http://'+line})
            opener = urllib2.build_opener(proxy_handler)
            response = opener.open('http://hi.baidu.com/sruingking/blog/item/f77d04dec3345a5394ee3794.html')
            response.close()
        # HTTPError is a subclass of URLError, so it has to be caught first;
        # otherwise this branch can never run.
        except urllib2.HTTPError:
            print('HTTPError! The bad proxy is %s' % line)
            errortype()
        except urllib2.URLError:
            print('URLError! The bad proxy is %s' % line)
            errortype()
        # Any other failure of this proxy. Exception (not a bare except)
        # keeps Ctrl-C able to stop a long run.
        except Exception:
            print('Unknown Error! The bad proxy is %s' % line)
            errortype()

    # Finish the log with the number of failures.
    f_log.write('There are '+str(num_Error)+' Errors''\n')
finally:
    f_proxy.close()
    f_log.close()

print('%d Errors' % num_Error)

郑重声明：此程序本人原创，只在本人blogger、百度博客、QQ空间和电脑爱好者官方论坛编程版发布，除此之外未经本人授权散布者均为侵权，本人保留诉讼权
授权链接: http://sruing.blogspot.com
http://hi.baidu.com/sruingking
http://bbs.cfan.com.cn/thread-840294-1-1.html

2009年2月9日星期一

【python】博客人气助手第三波——代理刷流量

写在前面的话

在使用python编程之前，你要确保你的计算机里安装了python。UNIX/LINUX的用户不

用另外安装python，它已经存在于你的计算机里；windows用户请到官方网站下载

python，推荐下载版本2.5.4高版本可能会出现不能理解的问题（本人已经遇到，3.0的IDLE

里不能运行程序）。安装完python要设置环境变量：我的电脑->属性->高级->环境变量，点

击选中path，然后再点击编辑，在最后加上;c:\pythonXX（XX是你的版本号如：25）确定

完成设置。

如图：

检测您的Python是否能用

我们来检查下您是否设置对了python：WIN-R->CMD 输入python 如果出现如下截图

那么恭喜您，您的python已经可以正常使用了（3.0版的不能保证-_-!!!）

一个简单的实例——博客刷人数第三波之代理刷流量

这是一个十分简单程序，程序我做了注释，相信您看下就会明白

# ProxyBat_v1.1.py 

import urllib2 

# Number of proxies that failed.
num_Error=0

# proxy.txt holds one proxy address per line.
f=open('proxy.txt','r')

try:
    # Visit the target page once through every proxy listed in the file.
    for raw_line in f:
        # Strip the trailing newline so it does not end up inside the proxy
        # URL; skip blank lines.
        line=raw_line.strip()
        if not line:
            continue

        try:
            # ProxyHandler() takes a dict mapping the scheme to the proxy URL;
            # build_opener() returns an opener whose open() goes through it.
            proxy_handler = urllib2.ProxyHandler({'http':'http://'+line})
            opener = urllib2.build_opener(proxy_handler)
            response = opener.open('http://hi.baidu.com/sruingking')
            response.close()
        # HTTPError is a subclass of URLError, so it has to be caught first;
        # otherwise this branch can never run.
        except urllib2.HTTPError:
            print('HTTPError! The bad proxy is %s' % line)
            num_Error=num_Error+1
        except urllib2.URLError:
            print('URLError! The bad proxy is %s' % line)
            num_Error=num_Error+1
        # Any other failure of this proxy. Exception (not a bare except)
        # keeps Ctrl-C able to stop a long run.
        except Exception:
            print('Unknown Errors! The bad proxy is %s' % line)
            num_Error=num_Error+1
finally:
    f.close()

# Report how many proxies failed.
print('%d Errors' % num_Error)

下面是一个输出异常的截图:

写在后面的话

程序很简单，可为了找到要用到的函数，我第一次翻看了全英文的文档，也就是Python

提供的官方文档，程序中我对几个方法和函数的注释不是很清楚，也就是怕自己的翻译误导

您，如果您看了本文对python感兴趣，您可以自己翻翻看看，上面的资料很全。

对程序的补充说明

虽然知道有的代理地址会不能用，但并不知道会刷得那么慢，满以为一瞬间就会刷完，没想到过了20分钟才将143个代理访问完，而且返回44个异常地址，实际刷了92次。实际

效果并没有预想的那么好，我的热情也就冷却了一半。Python和C的区别还是比较大的，

文档上的解释也不是和MSDN一路，可能还是函数和方法的选用上有些问题，我会继续看

文档，权当学英语了，看看能不能有什么其他的发现。

郑重声明：此程序本人原创，只在本人blogger、百度博客、QQ空间和电脑爱好者官方论坛编程版发布，除此之外未经本人授权散布者均为侵权，本人保留诉讼权
授权链接: http://sruing.blogspot.com
http://hi.baidu.com/sruingking
http://bbs.cfan.com.cn/thread-840294-1-1.html

订阅：博文 (Atom)