VB 读取网页源码并分析其中的URL
昨天花了一晚上的时间写出来的。
vb获取网页源码的方法:http://www.tiancao.net/bbs/ShowPost.asp?ThreadID=6
'Option Explicit
Private Declare Function SendMessage Lib "user32" _
Alias "SendMessageA" (ByVal hwnd As Long, _
ByVal wMsg As Long, ByVal wParam As Long, _
lParam As Any) As Long
Private Const LB_SETHORIZONTALEXTENT = &H194
Private Sub Command1_Click()
On Error Resume Next
Dim HTML As String
HTML = viewSource(Me.Text2.Text)
If HTML = "" Then Exit Sub
Me.Text1.Text = HTML
Dim URLS() As String
Url_In_Html Me.Text2.Text, URLS
Dim i As Integer
Me.List1.Clear
For i = 0 To UBound(URLS)
Me.List1.AddItem URLS(i)
Next
Me.Timer1.Enabled = True
End Sub
'URL为网页地址
'URLS为网页代码中的URL组
Function Url_In_Html(URL As String, ByRef URLS() As String)
Dim i As Long, j As Integer
URL = Replace(URL, "\", "/") '将网页地址中可能含有的“\“全部替换成成”/”,这样地址中的分割符合就一致。
i = inStr_n(URL, "/") '比如给的路径是http://www.tiancao.net
If i = 2 Then URL = URL & "/"
i = InStrRev(URL, "/")
Dim URL1 As String
URL1 = Left(URL, i) '查找地址的绝对地址路径
Dim HTML As String
HTML = viewSource(URL)
If HTML = "" Then Exit Function
HTML = UCase(HTML) '将网页源码全部转换成大写
Dim N As Integer
Dim index() As Long
N = inStr_n(HTML, "HREF", index)
'MsgBox "总共有" & N & "个href标签"
Dim Temp As String
Dim Temp1 As String
Dim Temp2 As Integer
Dim Temp3 As Integer
Dim Temp4 As Integer
Dim Temp5 As Integer
Dim M As Integer
For i = 0 To N - 1
Temp = Mid(HTML, index(i) + 5, 300)
'这里取url的长度为300,如果超过则检测不到,这300个字符中可能包含下一个或几个HREF标签,但这不用担心,程序会分析每个标签的。
'为什么取那么多,是因为很多网页的URL编码可能很长,比如百度推广的广告和陶宝网的网址都很长。
For j = 2 To Len(Temp)
If Mid(Temp, j, 1) = """" Or Mid(Temp, j, 1) = ">" Or Mid(Temp, j, 1) = "'" Then
Temp1 = Left(Temp, j - 1)
Temp2 = InStr(Temp1, "+") 'URL中含有+号的(比如<a href="'+location.href+'">)
Temp3 = InStr(Temp1, "#") 'URL中含有#号的(比如<a href="#top">)
Temp4 = InStr(Temp1, " ") 'URL中含有空格的(比如<a href=http://www.tiancao.net/ target=_blank>)
Temp5 = InStr(Temp1, "MAILTO") 'URL中含有空格的(比如<a href="mailto:tiancao1001@126.com">)
'没有能检查所以的情况
If Temp2 = 0 And Temp3 = 0 And Temp4 = 0 And Temp5 = 0 Then
ReDim Preserve URLS(M)
If Left(Temp1, 1) = """" Or Left(Temp1, 1) = "'" Then Temp1 = Right(Temp1, Len(Temp1) - 1) 'URL前面可能还有个引号或单引号
If Temp2 = InStr(Temp1, ":") <> 0 Then
'存在冒号,说明是绝对路径(HTTP://),没有用判断HTTP来判断,是因为windows可以用HTTP给文件夹命名,而不可以用冒号
If Left(Temp1, 1) = "/" Or Left(Temp1, 1) = "\" Then
Temp1 = URL1 & Right(Temp1, Len(Temp1) - 1)
Else
Temp1 = URL1 & Temp1
End If
End If
URLS(M) = Temp1
M = M + 1
Exit For
End If
End If
Next
Next
End Function
'返回某一字符串在另一个字符串中出现的次数 index返回出现的位置数组
Public Function inStr_n(str As String, StrIn As String, Optional index As Variant) As Long
Dim i As Long
Dim Temp As Long: Temp = 1
Dim N As Long
N = 0
For i = 1 To Len(str)
Temp = InStr(Temp + 1, str, StrIn)
If Temp = 0 Then
Exit For
Else
If IsMissing(index) = False Then
ReDim Preserve index(N)
index(N) = Temp
End If
N = N + 1
End If
Next i
inStr_n = N
End Function
'查看网页的源码
Function viewSource(URL)
On Error GoTo E:
Dim XmlHttp
Set XmlHttp = CreateObject("Microsoft.XMLHTTP")
XmlHttp.Open "GET", URL, False
XmlHttp.setRequestHeader "Content-Type", "text/XML"
XmlHttp.Send
Dim HTML
HTML = bytes2BSTR(XmlHttp.responseBody)
viewSource = HTML
Exit Function
E:
viewSource = ""
End Function
'只能得到西文的字符串,中文只能显示GB2312编码。
Function bytes2BSTR(vIn)
Dim strReturn As String
Dim i As Long
Dim Thischarcode As Integer
Dim nextcharcode As Integer
strReturn = ""
For i = 1 To LenB(vIn)
Thischarcode = AscB(MidB(vIn, i, 1))
If Thischarcode < &H80 Then
strReturn = strReturn & Chr(Thischarcode)
Else
nextcharcode = AscB(MidB(vIn, i + 1, 1))
strReturn = strReturn & Chr(CLng(Thischarcode) * &H100 + CInt(nextcharcode))
i = i + 1
End If
Next
bytes2BSTR = strReturn
End Function
Private Sub Form_Load()
Me.WebBrowser1.Navigate "http://www.tiancao.net/"
Me.Timer1.Enabled = False
addHorScrlBarListBox List1
End Sub
Private Sub List1_DblClick()
Me.WebBrowser1.Navigate Me.List1.List(Me.List1.ListIndex)
End Sub
'每一分钟随机打开list中的一个连接
Private Sub Timer1_Timer()
On Error Resume Next
Dim j As Integer
j = Rnd() * Me.List1.ListCount
Me.WebBrowser1.Navigate Me.List1.List(j)
End Sub
' list加横向滚动条
Public Sub addHorScrlBarListBox(ByVal refControlListBox As Object)
Dim nRet As Long
Dim nNewWidth As Integer
nNewWidth = refControlListBox.Width * 4 ' 新宽度,以像素为单位。
nRet = SendMessage(refControlListBox.hwnd, _
LB_SETHORIZONTALEXTENT, nNewWidth, ByVal 0&)
End Sub
Private Declare Function SendMessage Lib "user32" _
Alias "SendMessageA" (ByVal hwnd As Long, _
ByVal wMsg As Long, ByVal wParam As Long, _
lParam As Any) As Long
Private Const LB_SETHORIZONTALEXTENT = &H194
Private Sub Command1_Click()
On Error Resume Next
Dim HTML As String
HTML = viewSource(Me.Text2.Text)
If HTML = "" Then Exit Sub
Me.Text1.Text = HTML
Dim URLS() As String
Url_In_Html Me.Text2.Text, URLS
Dim i As Integer
Me.List1.Clear
For i = 0 To UBound(URLS)
Me.List1.AddItem URLS(i)
Next
Me.Timer1.Enabled = True
End Sub
'URL为网页地址
'URLS为网页代码中的URL组
Function Url_In_Html(URL As String, ByRef URLS() As String)
Dim i As Long, j As Integer
URL = Replace(URL, "\", "/") '将网页地址中可能含有的“\“全部替换成成”/”,这样地址中的分割符合就一致。
i = inStr_n(URL, "/") '比如给的路径是http://www.tiancao.net
If i = 2 Then URL = URL & "/"
i = InStrRev(URL, "/")
Dim URL1 As String
URL1 = Left(URL, i) '查找地址的绝对地址路径
Dim HTML As String
HTML = viewSource(URL)
If HTML = "" Then Exit Function
HTML = UCase(HTML) '将网页源码全部转换成大写
Dim N As Integer
Dim index() As Long
N = inStr_n(HTML, "HREF", index)
'MsgBox "总共有" & N & "个href标签"
Dim Temp As String
Dim Temp1 As String
Dim Temp2 As Integer
Dim Temp3 As Integer
Dim Temp4 As Integer
Dim Temp5 As Integer
Dim M As Integer
For i = 0 To N - 1
Temp = Mid(HTML, index(i) + 5, 300)
'这里取url的长度为300,如果超过则检测不到,这300个字符中可能包含下一个或几个HREF标签,但这不用担心,程序会分析每个标签的。
'为什么取那么多,是因为很多网页的URL编码可能很长,比如百度推广的广告和陶宝网的网址都很长。
For j = 2 To Len(Temp)
If Mid(Temp, j, 1) = """" Or Mid(Temp, j, 1) = ">" Or Mid(Temp, j, 1) = "'" Then
Temp1 = Left(Temp, j - 1)
Temp2 = InStr(Temp1, "+") 'URL中含有+号的(比如<a href="'+location.href+'">)
Temp3 = InStr(Temp1, "#") 'URL中含有#号的(比如<a href="#top">)
Temp4 = InStr(Temp1, " ") 'URL中含有空格的(比如<a href=http://www.tiancao.net/ target=_blank>)
Temp5 = InStr(Temp1, "MAILTO") 'URL中含有空格的(比如<a href="mailto:tiancao1001@126.com">)
'没有能检查所以的情况
If Temp2 = 0 And Temp3 = 0 And Temp4 = 0 And Temp5 = 0 Then
ReDim Preserve URLS(M)
If Left(Temp1, 1) = """" Or Left(Temp1, 1) = "'" Then Temp1 = Right(Temp1, Len(Temp1) - 1) 'URL前面可能还有个引号或单引号
If Temp2 = InStr(Temp1, ":") <> 0 Then
'存在冒号,说明是绝对路径(HTTP://),没有用判断HTTP来判断,是因为windows可以用HTTP给文件夹命名,而不可以用冒号
If Left(Temp1, 1) = "/" Or Left(Temp1, 1) = "\" Then
Temp1 = URL1 & Right(Temp1, Len(Temp1) - 1)
Else
Temp1 = URL1 & Temp1
End If
End If
URLS(M) = Temp1
M = M + 1
Exit For
End If
End If
Next
Next
End Function
'返回某一字符串在另一个字符串中出现的次数 index返回出现的位置数组
Public Function inStr_n(str As String, StrIn As String, Optional index As Variant) As Long
Dim i As Long
Dim Temp As Long: Temp = 1
Dim N As Long
N = 0
For i = 1 To Len(str)
Temp = InStr(Temp + 1, str, StrIn)
If Temp = 0 Then
Exit For
Else
If IsMissing(index) = False Then
ReDim Preserve index(N)
index(N) = Temp
End If
N = N + 1
End If
Next i
inStr_n = N
End Function
'查看网页的源码
Function viewSource(URL)
On Error GoTo E:
Dim XmlHttp
Set XmlHttp = CreateObject("Microsoft.XMLHTTP")
XmlHttp.Open "GET", URL, False
XmlHttp.setRequestHeader "Content-Type", "text/XML"
XmlHttp.Send
Dim HTML
HTML = bytes2BSTR(XmlHttp.responseBody)
viewSource = HTML
Exit Function
E:
viewSource = ""
End Function
'只能得到西文的字符串,中文只能显示GB2312编码。
Function bytes2BSTR(vIn)
Dim strReturn As String
Dim i As Long
Dim Thischarcode As Integer
Dim nextcharcode As Integer
strReturn = ""
For i = 1 To LenB(vIn)
Thischarcode = AscB(MidB(vIn, i, 1))
If Thischarcode < &H80 Then
strReturn = strReturn & Chr(Thischarcode)
Else
nextcharcode = AscB(MidB(vIn, i + 1, 1))
strReturn = strReturn & Chr(CLng(Thischarcode) * &H100 + CInt(nextcharcode))
i = i + 1
End If
Next
bytes2BSTR = strReturn
End Function
Private Sub Form_Load()
Me.WebBrowser1.Navigate "http://www.tiancao.net/"
Me.Timer1.Enabled = False
addHorScrlBarListBox List1
End Sub
Private Sub List1_DblClick()
Me.WebBrowser1.Navigate Me.List1.List(Me.List1.ListIndex)
End Sub
'每一分钟随机打开list中的一个连接
Private Sub Timer1_Timer()
On Error Resume Next
Dim j As Integer
j = Rnd() * Me.List1.ListCount
Me.WebBrowser1.Navigate Me.List1.List(j)
End Sub
' list加横向滚动条
Public Sub addHorScrlBarListBox(ByVal refControlListBox As Object)
Dim nRet As Long
Dim nNewWidth As Integer
nNewWidth = refControlListBox.Width * 4 ' 新宽度,以像素为单位。
nRet = SendMessage(refControlListBox.hwnd, _
LB_SETHORIZONTALEXTENT, nNewWidth, ByVal 0&)
End Sub
[本日志由 田草 于 2008-06-29 10:41 PM 编辑]
|
tiancao1001 于 2022-07-21 09:18 PM 发表评论:
Dim request As System.Net.HttpWebRequest = System.Net.HttpWebRequest.Create("http://www.bycad.cn/")
request.AllowAutoRedirect = True
request.MaximumAutomaticRedirections = 50
request.ServicePoint.Expect100Continue = True
request.CookieContainer = New CookieContainer()
Dim response As System.Net.HttpWebResponse = request.GetResponse()
Dim sr As System.IO.StreamReader = New System.IO.StreamReader(response.GetResponseStream())
Dim sourcecode As String = sr.ReadToEnd()
TextBox1.Text = sourcecode
End Sub
tiancao1001 于 2010-11-29 09:51 PM 发表评论:
Private Sub Command1_Click()
Command1.Enabled = False
WebBrowser1.Navigate2 Text1.Text
End Sub
Private Sub WebBrowser1_DocumentComplete(ByVal pDisp As Object, URL As Variant)
Dim x As Long
List1.Clear
If Not WebBrowser1.Busy Then
DoEvents
End If
For x = 0 To WebBrowser1.Document.Links.length - 1
List1.AddItem WebBrowser1.Document.Links.Item(x)
Next x
Command1.Enabled = True
End Sub
Private Sub WebBrowser1_StatusTextChange(ByVal Text As String)
Label3 = Text
End Sub
发表评论 - 不要忘了输入验证码哦! |