using System;
using System.Data;
using System.Data.SqlClient;
using System.IO;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Web;

/// <summary>
/// Console scraper: for every keyword read from a text file it queries the
/// search.tom.com MP3 search page, extracts media links plus size / singer /
/// album text with regular expressions, and inserts one row per hit into the
/// <c>infor</c> table of a SQL Server database.
/// </summary>
/// <remarks>
/// The public static fields are scratch state kept only for backward
/// compatibility with existing callers; they hold the values of the most
/// recent request and are not safe for concurrent use.
/// </remarks>
public static class RegexTest
{
    // ---- Legacy public scratch state (kept so existing callers still compile) ----
    public static string req;
    public static string infor;
    public static string keys;
    public static string reg2;
    public static string reg3;
    public static string reg4;
    public static string reg1;
    public static string reg5;
    public static string nums;
    public static string mainurl;
    public static string mainstyle;
    public static string mainsinger;
    public static string mainsong;
    public static string mainspecial;
    public static string mainsize;
    public static int id = 0;
    public static int k;
    public static MatchCollection m1;
    public static MatchCollection m2;
    public static MatchCollection m3;
    public static MatchCollection m4;
    public static MatchCollection m5;
    public static MatchCollection mnum;
    public static HttpWebResponse HttpWResp;
    public static HttpWebRequest HttpWReq;
    public static SqlConnection Conn;

    // ---- Patterns (runtime text; do not reformat) ----

    // Anchor tag whose href points at a media file, up to the text after target=_blank.
    private const string LinkPattern = @"(.a\shref='(http|https|ftp|rtsp|mms|\d{1}):(\/\/|\\|\\\\){1}(([A-Za-z0-9_-])+[.]){1,}([a-z0-9]{1,3})([^ \f\n\r\t\v\""\'\>]*\/)(([^ \f\n\r\t\v\""\>~])+[\.]{1}(((m|M)(p|P)3)|((w|W)(M|m)(v|V))|((w|W)(M|m)(A|a))|((M|m)(p|P)(G|g))|((A|a)(S|s)(F|f))|((W|w)(A|a)(V|v))|((r|R)(M|m))|((m|M)(I|i)(D|d))|((A|a)(V|v)(I|i))|(M|m)(O|o)(V|v)|(S|s)(W|w)(F|f)|((R|r)(A|a)(M|m))|(gs)|(asx)|(flv)|(mpga)|((f|F)lash)))')\s+target=_blank.+[^(\n)/]";

    // File size such as "3.5M" that follows an "nbsp;" entity.
    private const string SizePattern = @"(?<=nbsp;)[0-9\.]{1,5}(M|K)";

    // Value of the singer= / special= query parameter in result links.
    private const string SingerPattern = @"(?<=searchmp3.php.singer=)\w*([^\""]*)*";
    private const string SpecialPattern = @"(?<=searchmp3.php.special=)\w*([^\""]*)*";

    // Text between the two marker words that surround the total hit count on the page.
    private const string TotalPattern = @"(?<=找到)[^首]*";

    private const string TagPattern = "<[^>]*>";
    private const string HrefPrefixPattern = @".a\shref=\'";
    private const string LinkTailPattern = @"\'\s+target=_blank.+[^(\n)/]";
    private const string UpToLastDotPattern = @"[^\.]*\.";
    private const string QuoteAndPipePattern = @"[\""\'|]";

    private const RegexOptions CaseSensitiveOptions =
        RegexOptions.IgnorePatternWhitespace | RegexOptions.ExplicitCapture;
    private const RegexOptions CaseInsensitiveOptions =
        RegexOptions.IgnoreCase | RegexOptions.IgnorePatternWhitespace | RegexOptions.ExplicitCapture;

    /// <summary>
    /// Searches for <paramref name="keys"/>, walks the result pages and stores
    /// every extracted hit through <see cref="datainsert"/>.
    /// </summary>
    /// <param name="keys">Search keyword (sent as the <c>singer</c> query parameter).
    /// Null or blank keywords are ignored.</param>
    /// <remarks>
    /// Best effort: any failure for a keyword is reported on standard error and
    /// the method returns normally so the caller can continue with the next keyword.
    /// </remarks>
    public static void sick(string keys)
    {
        // A blank keyword would only produce an empty search; skip it.
        if (keys == null || keys.Trim().Length == 0)
        {
            return;
        }

        try
        {
            // Start with one page; the real page count is learned from the first response.
            k = 1;
            for (int j = 0; j < k; j++)
            {
                req = "http://search.tom.com/searchmp3.php?singer="
                    + HttpUtility.UrlEncode(keys, Encoding.Default)
                    + "&items=" + (j + 1).ToString();

                HttpWReq = (HttpWebRequest)WebRequest.Create(req);
                // Dispose the response even when reading or decoding fails.
                using (HttpWebResponse response = (HttpWebResponse)HttpWReq.GetResponse())
                {
                    HttpWResp = response;
                    infor = TextContent(response);
                }

                reg1 = LinkPattern;
                reg2 = SizePattern;
                reg3 = SingerPattern;
                reg4 = SpecialPattern;
                nums = TotalPattern;

                m2 = Regex.Matches(infor, reg2, CaseSensitiveOptions);
                m3 = Regex.Matches(infor, reg3, CaseInsensitiveOptions);
                m4 = Regex.Matches(infor, reg4, CaseInsensitiveOptions);
                m1 = Regex.Matches(infor, reg1, CaseInsensitiveOptions);
                mnum = Regex.Matches(infor, nums, CaseSensitiveOptions);

                if (mnum.Count == 0)
                {
                    Console.Error.WriteLine("No result count found for '" + keys + "' at " + req + "; keyword skipped.");
                    return;
                }

                string sum = Regex.Replace(mnum[0].Value, TagPattern, "");
                int total;
                if (!int.TryParse(sum, out total))
                {
                    Console.Error.WriteLine("Unreadable result count '" + sum + "' for '" + keys + "'; keyword skipped.");
                    return;
                }

                // Page count derived from the total (presumably 20 hits per page), capped at 20 pages.
                k = total / 20;
                if (k > 20)
                {
                    k = 20;
                }

                int sumber = m1.Count;
                int inserted = 0;
                // Only every second link match is used (presumably each link occurs twice in the page).
                for (int ii = 0; ii < sumber; ii += 2)
                {
                    int row = ii / 2;
                    // The size / singer / album matches are paired with links by position only.
                    if (row >= m2.Count || row >= m3.Count || row >= m4.Count)
                    {
                        Console.Error.WriteLine("Result " + (row + 1).ToString() + " for '" + keys
                            + "' has no matching size/singer/album entry; remaining results on this page skipped.");
                        break;
                    }

                    string anchor = m1[ii].Value;
                    string withoutPrefix = Regex.Replace(anchor, HrefPrefixPattern, "");
                    mainurl = Regex.Replace(withoutPrefix, LinkTailPattern, "");
                    mainstyle = Regex.Replace(mainurl, UpToLastDotPattern, "");
                    mainsong = Regex.Replace(anchor, TagPattern, "");
                    mainsinger = m3[row].Value;
                    mainspecial = m4[row].Value;
                    mainsize = m2[row].Value;

                    datainsert(mainurl, mainsong, mainsinger, mainspecial, mainsize, mainstyle);
                    inserted++;
                }

                id++;
                Console.WriteLine(id.ToString() + ": " + keys + " is added successfully! counts= " + inserted.ToString());
            }
        }
        catch (Exception ex)
        {
            // Keep going with the next keyword, but never fail silently.
            Console.Error.WriteLine("Failed to process '" + keys + "': " + ex.Message);
        }
    }

    /// <summary>
    /// Picks the text encoding to decode <paramref name="response"/> with.
    /// </summary>
    /// <param name="response">The HTTP response whose headers are inspected.</param>
    /// <returns>See <see cref="ResolveEncoding"/>.</returns>
    public static Encoding GetEncoding(HttpWebResponse response)
    {
        return ResolveEncoding(response.ContentEncoding, response.ContentType);
    }

    /// <summary>
    /// Resolves an encoding from raw header values.
    /// </summary>
    /// <param name="contentEncoding">Value of the Content-Encoding header; may be null or empty.</param>
    /// <param name="contentType">Value of the Content-Type header; may be null or empty.</param>
    /// <returns>
    /// The encoding named by <paramref name="contentEncoding"/> if that is a known
    /// encoding name; otherwise the encoding named by the <c>charset</c> parameter of
    /// <paramref name="contentType"/>; otherwise <see cref="Encoding.Default"/>.
    /// </returns>
    public static Encoding ResolveEncoding(string contentEncoding, string contentType)
    {
        // Content-Encoding is tried first for compatibility; values such as "gzip"
        // are not encoding names and simply fall through to the charset.
        Encoding resolved = TryGetEncoding(contentEncoding);
        if (resolved == null)
        {
            resolved = TryGetEncoding(ExtractCharset(contentType));
        }
        return resolved != null ? resolved : Encoding.Default;
    }

    // Returns the charset parameter value (unquoted, without trailing parameters), or null.
    private static string ExtractCharset(string contentType)
    {
        if (string.IsNullOrEmpty(contentType))
        {
            return null;
        }
        Match match = Regex.Match(contentType, @"charset\s*=\s*[""']?(?<name>[^\s;""']+)", RegexOptions.IgnoreCase);
        return match.Success ? match.Groups["name"].Value : null;
    }

    // Returns the encoding with the given name, or null when the name is empty or unknown.
    private static Encoding TryGetEncoding(string name)
    {
        if (string.IsNullOrEmpty(name))
        {
            return null;
        }
        try
        {
            return Encoding.GetEncoding(name);
        }
        catch (ArgumentException)
        {
            return null;
        }
        catch (NotSupportedException)
        {
            return null;
        }
    }

    /// <summary>
    /// Reads the whole response body as text.
    /// </summary>
    /// <param name="response">The HTTP response to read.</param>
    /// <returns>The body decoded with <see cref="GetEncoding"/>, with every line
    /// terminated by <c>"\r\n"</c> (including the last one).</returns>
    public static string TextContent(HttpWebResponse response)
    {
        StringBuilder buffer = new StringBuilder();
        using (Stream stream = response.GetResponseStream())
        using (StreamReader reader = new StreamReader(stream, GetEncoding(response)))
        {
            string line;
            while ((line = reader.ReadLine()) != null)
            {
                buffer.Append(line).Append("\r\n");
            }
        }
        return buffer.ToString();
    }

    /// <summary>
    /// Inserts one scraped record into the <c>infor</c> table using <see cref="Conn"/>.
    /// </summary>
    /// <param name="url1">Media URL.</param>
    /// <param name="song1">Song title.</param>
    /// <param name="singer1">Singer name.</param>
    /// <param name="special1">Album name.</param>
    /// <param name="sizes">File size text.</param>
    /// <param name="style">File extension.</param>
    /// <remarks>
    /// Values are passed as SQL parameters, so scraped text can no longer alter the
    /// statement. The stored values are unchanged from the previous implementation:
    /// quotes and pipes are still stripped from url/song/singer/album, and the
    /// padding spaces around singer and type are kept so new rows match existing ones.
    /// </remarks>
    public static void datainsert(string url1, string song1, string singer1, string special1, string sizes, string style)
    {
        string url = Regex.Replace(url1, QuoteAndPipePattern, "");
        string song = Regex.Replace(song1, QuoteAndPipePattern, "");
        string singer = Regex.Replace(singer1, QuoteAndPipePattern, "");
        string special = Regex.Replace(special1, QuoteAndPipePattern, "");

        const string sql =
            "insert into infor(url,song,singer,special,sizes,type) "
            + "values(@url,@song,@singer,@special,@sizes,@type)";

        using (SqlCommand comm = new SqlCommand(sql, Conn))
        {
            comm.Parameters.AddWithValue("@url", url);
            comm.Parameters.AddWithValue("@song", song);
            comm.Parameters.AddWithValue("@singer", singer + " ");
            comm.Parameters.AddWithValue("@special", special);
            comm.Parameters.AddWithValue("@sizes", sizes);
            comm.Parameters.AddWithValue("@type", " " + style + " ");
            comm.ExecuteNonQuery();
        }
    }

    /// <summary>
    /// Entry point: opens the database connection, then runs <see cref="sick"/> for
    /// every line of <c>C:\s.txt</c>, echoing each keyword after it has been processed.
    /// </summary>
    public static void Main()
    {
        // NOTE(review): credentials are hard-coded here; move them to configuration.
        using (SqlConnection connection = new SqlConnection("server=localhost;database=index;uid=baker;pwd=baker"))
        {
            Conn = connection;
            try
            {
                connection.Open();
                using (StreamReader objReader = new StreamReader("C:\\s.txt"))
                {
                    string sLine;
                    while ((sLine = objReader.ReadLine()) != null)
                    {
                        sick(sLine);
                        Console.WriteLine(sLine);
                    }
                }
            }
            finally
            {
                // Do not leave a disposed connection behind in the shared field.
                Conn = null;
            }
        }
    }
}

// Author's note from the original post (translated):
// As for how it works, it is really very simple and needs little explanation.
// The domain name has already been registered - not bad, right?
// The speed tested fine.
// The program includes inserting the search results into the database; on one
// night it scraped a full 260,000 records.

评论