以下代码用于抓取IDC圈网四川地区所有IDC服务商的网址,其工作流程是:先读取IDC服务商名称列表页,再逐个进入详细页读取网址信息。学习网页抓取和正则表达式的朋友一定要收藏哈。
完整代码详见内页,已在ASP环境下测试通过。

[code lang="vb"]
<%
' Main script: scrape the Sichuan IDC provider directory.
'   1. Download the list page and cut out the section holding provider links.
'   2. For every <a> in that section, download the linked detail page.
'   3. Cut the provider's web address out of the detail page and print
'      "<name>&nbsp;&nbsp;&nbsp;&nbsp;<address>" followed by a <br>.
'
' NOTE(review): hrefs are passed to httpget exactly as they appear in the
' list page; if the site ever emits relative links they will need to be
' resolved against listUrl first -- confirm against the live markup.

' One synchronous HTTP request is made per provider, so allow the page
' more time than the ASP default before it is aborted.
Server.ScriptTimeout = 600

listUrl = "http://dh.idcquan.com/sichuan/"

' Markers delimiting the provider table on the list page.
listStart = "并正在开展IDC业务的企业"
listEnd = "</table>"

' Markers delimiting the web-address cell on a detail page.
' These never change, so they are set once instead of on every iteration.
urlStart = "网  址</STRONG>:"
urlEnd = "</TD>"

listSection = cutestr(httpget(listUrl), listStart, listEnd)

Set reg = New RegExp
reg.IgnoreCase = True
reg.Global = True
reg.MultiLine = True
' SubMatches(0) = optional quote char, (1) = href value, (2) = anchor inner HTML.
reg.Pattern = "<a.+?href=(['""]?)([^>\s]+)\1.*?>([\S\s]+?)<\/a>"

' & "" coerces a failed (Empty/Null) extraction to a real string for Execute.
Set MC = reg.Execute(listSection & "")

For Each m In MC
    ckURL = m.SubMatches(1)
    idcname = m.SubMatches(2)
    ' A failed download or a missing marker leaves the address blank,
    ' but the provider name is still listed.
    urlinfo = cutestr(httpget(ckURL), urlStart, urlEnd)
    Response.Write idcname & "&nbsp;&nbsp;&nbsp;&nbsp;" & urlinfo & vbCrLf & "<br>"
Next

Set MC = Nothing
Set reg = Nothing
' End of main script
' Returns the text of str that lies between the first occurrence of s1 and
' the next occurrence of s2 after it (both markers excluded).
'
'   str - text to search
'   s1  - start marker
'   s2  - end marker, searched for only after the end of s1
'
' Returns "" when str is Null, when s1 is not found, or when s2 is not found
' after s1. Previously a missing s1 made the search start at offset Len(s1),
' which could return unrelated text from the beginning of str.
Function cutestr(str, s1, s2)
    Dim startPos, endPos

    cutestr = ""
    If IsNull(str) Then Exit Function

    startPos = InStr(str, s1)
    If startPos = 0 Then Exit Function

    ' Step past the start marker itself.
    startPos = startPos + Len(s1)

    endPos = InStr(startPos, str, s2)
    If endPos = 0 Then Exit Function

    cutestr = Mid(str, startPos, endPos - startPos)
End Function

' Downloads url with a synchronous HTTP GET and returns the response body
' decoded to a string by BytesToBstr.
'
'   url - absolute URL to fetch
'
' Returns "" when the request object cannot be created, the request fails,
' the server answers with a status other than 200, or decoding fails.
'
' Uses ServerXMLHTTP rather than XMLHTTP: the latter is built on WinInet,
' which Microsoft does not support inside server processes such as ASP.
Function httpget(url)
    Dim xmlhttp

    httpget = ""

    ' Best-effort fetch: any failure below results in "" rather than
    ' aborting the calling page.
    On Error Resume Next

    Set xmlhttp = Server.CreateObject("Msxml2.ServerXMLHTTP")
    If Err.Number <> 0 Then
        Err.Clear
        Exit Function
    End If

    xmlhttp.open "GET", url, False
    xmlhttp.send

    If Err.Number = 0 Then
        If xmlhttp.status = 200 Then
            httpget = BytesToBstr(xmlhttp.responseBody)
        End If
    End If

    ' An error anywhere above (including inside BytesToBstr) means the
    ' result cannot be trusted.
    If Err.Number <> 0 Then
        httpget = ""
        Err.Clear
    End If

    Set xmlhttp = Nothing
End Function

' Converts a binary response body into a string by writing the bytes into an
' ADODB.Stream in binary mode, then re-reading the same stream as GB2312 text.
'
'   body - binary data to decode
'
' Returns the text read back from the stream.
Function BytesToBstr(body)
    Const STREAM_BINARY = 1
    Const STREAM_TEXT = 2
    Const MODE_READ_WRITE = 3

    Dim stm
    Set stm = Server.CreateObject("adodb.stream")

    With stm
        ' Load the raw bytes.
        .Type = STREAM_BINARY
        .Mode = MODE_READ_WRITE
        .Open
        .Write body

        ' Rewind, then switch the stream to text so ReadText decodes the bytes.
        .Position = 0
        .Type = STREAM_TEXT
        .Charset = "GB2312"
        BytesToBstr = .ReadText

        .Close
    End With

    Set stm = Nothing
End Function
%>[/code]

Original resource:Click here

分类: Asp

发表评论

电子邮件地址不会被公开。 必填项已用*标注