关于一个小爬虫

Author Avatar
幽浮404 2017年02月14日
  • 在其它设备中阅读本文章

用 C# 实现了一个自动爬取头像的简单小程序。
具体用到了 C# 的 WebClient:先用 WebClient 下载目标网页的源代码,然后通过正则表达式分析网页源代码,获得头像的 URL。
因为是下载目标页面的完整源代码然后分析,所以效率比较低。

using System;
using System.Collections.Generic;
using System.Data;
using System.IO;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Windows.Forms;

namespace Crawler
{
    /// <summary>
    /// Main window of the avatar crawler.
    /// A <see cref="System.Timers.Timer"/> walks user ids one at a time, downloads each
    /// profile page and records candidate avatar URLs in an in-memory table; the form's
    /// <c>timer1</c> then downloads the recorded URLs to disk one row per tick.
    /// Progress (the last crawled id and three label texts) is persisted to a small
    /// four-line text file when the form closes and restored when it loads.
    /// </summary>
    public partial class Form1 : Form
    {
        // Avatar URL that is ignored when found on a profile page.
        private const string DefaultAvatarUrl = "http://cdn.aixifan.com/dotnet/20120923/style/image/avatar.jpg";

        // Directory that downloaded avatars are written to.
        private const string DownloadDirectory = @"c:\mv\";

        // Captures the value of the style attribute of a <div> tag.
        private static readonly Regex DivStyleRegex = new Regex("<div style=\"(.+?)\".*>");

        // The progress file lives next to the executable instead of at a hard-coded,
        // user-specific absolute path, so the program also starts on other machines.
        private static readonly string ConfigPath =
            Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "config.txt");

        // Guards dt and the running flag: Method2 is raised by a System.Timers.Timer that
        // has no SynchronizingObject, while timer1_Tick and the button handlers run on the
        // UI thread.
        private readonly object syncRoot = new object();

        // True between a click on the start button and a click on the stop button.
        private bool running;

        public Form1()
        {
            InitializeComponent();
        }

        // Crawl timer; the interval is 100 ms.
        System.Timers.Timer t = new System.Timers.Timer(100);
        // Not used by any code in this part of the class.
        List<string> lisurl = new List<string>();
        // Discovered avatars: one row per candidate URL (columns are added in Form1_Load).
        DataTable dt = new DataTable();
        // Id of the user page that was crawled most recently.
        int i = 0;
        // Index of the next row of dt that timer1_Tick will download.
        int counts = 0;

        /// <summary>
        /// Restores the saved progress (when a progress file exists), defines the columns
        /// of the result table and wires up the crawl timer.
        /// </summary>
        private void Form1_Load(object sender, EventArgs e)
        {
            // A missing file (first start, or after moving the program) is not an error:
            // start from id 0 and keep the label texts as they are.
            if (File.Exists(ConfigPath))
            {
                using (StreamReader sr = new StreamReader(ConfigPath, Encoding.Default))
                {
                    int savedId;
                    // An empty or corrupt first line leaves i unchanged instead of crashing.
                    if (int.TryParse(sr.ReadLine(), out savedId))
                    {
                        i = savedId;
                    }
                    label2.Text = sr.ReadLine();
                    label4.Text = sr.ReadLine();
                    label6.Text = sr.ReadLine();
                }
            }

            dt.Columns.Add("UID");
            dt.Columns.Add("URL");
            dt.Columns.Add("count");
            dt.Columns.Add("是否下载");
            label7.Text = i.ToString();
            t.Elapsed += new System.Timers.ElapsedEventHandler(Method2); // runs each time the interval elapses
            t.AutoReset = true; // keep firing (true) rather than fire once (false)
        }

        // Not used by any code in this part of the class.
        System.Timers.Timer tr = new System.Timers.Timer(100);

        /// <summary>
        /// Crawl step: advances to the next user id, downloads that profile page and adds
        /// one row to <c>dt</c> for every quoted value found in a div style attribute,
        /// except the default avatar URL. Failures are logged and the id is skipped.
        /// </summary>
        void Method2(object source, System.Timers.ElapsedEventArgs e)
        {
            // Pause the timer while the page is being processed.
            t.Enabled = false;
            i++;
            // Remember the id so the row matches the page even if i is reset meanwhile.
            int uid = i;
            string url = "http://www.acfun.cn/u/" + uid.ToString() + ".aspx#page=1";
            try
            {
                string html;
                using (WebClient wc = new WebClient())
                {
                    wc.Encoding = Encoding.UTF8;
                    html = wc.DownloadString(url);
                }

                foreach (Match item in DivStyleRegex.Matches(html))
                {
                    string[] parts = item.Groups[1].Value.Split('\'');
                    if (parts.Length < 2)
                    {
                        // The original indexed parts[1] unconditionally and relied on a
                        // swallowed IndexOutOfRangeException, which ended the scan of this
                        // page. The same stopping point is kept, without the exception.
                        // NOTE(review): confirm whether later divs should be scanned too.
                        break;
                    }

                    if (parts[1] != DefaultAvatarUrl)
                    {
                        lock (syncRoot)
                        {
                            dt.Rows.Add(uid, parts[1], dt.Rows.Count + 1, 0);
                        }
                    }
                }
            }
            catch (Exception ex)
            {
                // Deliberately best-effort, like the original empty catch: a page that
                // cannot be fetched or parsed must not stop the crawl. The failure is
                // traced instead of vanishing silently.
                System.Diagnostics.Trace.WriteLine("Crawl of " + url + " failed: " + ex.Message);
            }
            finally
            {
                // Resume only if the user has not pressed stop while the request was in
                // flight; re-enabling unconditionally would undo the stop.
                lock (syncRoot)
                {
                    if (running)
                    {
                        t.Enabled = true;
                    }
                }
            }
        }

        /// <summary>Starts (or resumes) both the crawl timer and the download timer.</summary>
        private void button1_Click(object sender, EventArgs e)
        {
            lock (syncRoot)
            {
                running = true;
                t.Enabled = true; // start raising System.Timers.Timer.Elapsed
            }
            timer1.Enabled = true;
        }

        /// <summary>Saves the current progress to the progress file when the form closes.</summary>
        private void Form1_FormClosing(object sender, FormClosingEventArgs e)
        {
            WriteConfig(i.ToString(), label2.Text, label4.Text, label6.Text);
        }

        /// <summary>
        /// Download step: saves the avatar of the next pending row of <c>dt</c> as
        /// <c>c:\mv\&lt;UID&gt;.jpg</c>, marks the row as downloaded and refreshes the
        /// progress labels. A row whose download fails stays unmarked and is skipped.
        /// </summary>
        private void timer1_Tick(object sender, EventArgs e)
        {
            timer1.Enabled = false;
            try
            {
                DataRow pending = null;
                string url = null;
                string uid = null;
                lock (syncRoot)
                {
                    // The original tested "Count > counts + 1", which never reached the
                    // most recently added row; "Count > counts" covers every row.
                    if (dt.Rows.Count > counts)
                    {
                        pending = dt.Rows[counts];
                        url = pending["URL"].ToString();
                        uid = pending["UID"].ToString();
                    }
                }

                if (pending != null)
                {
                    // The download happens outside the lock so the crawl is not blocked.
                    if (TryDownload(url, DownloadDirectory + uid + ".jpg"))
                    {
                        lock (syncRoot)
                        {
                            pending["是否下载"] = 1;
                        }
                        label2.Text = uid;
                        label4.Text = (counts + 1).ToString();
                    }
                    // Advance on failure as well, so one bad URL cannot stall the queue.
                    counts++;
                }

                lock (syncRoot)
                {
                    label6.Text = dt.Rows.Count.ToString();
                }
                label7.Text = i.ToString();
            }
            finally
            {
                // Always re-arm the timer; previously an exception left it disabled.
                timer1.Enabled = true;
            }
        }

        /// <summary>Stops both the crawl timer and the download timer.</summary>
        private void button2_Click(object sender, EventArgs e)
        {
            lock (syncRoot)
            {
                running = false;
                t.Enabled = false; // stop raising System.Timers.Timer.Elapsed
            }
            timer1.Enabled = false;
        }

        /// <summary>
        /// Resets the saved progress: writes four zero lines to the progress file, sets
        /// the current id to 0 and shows 0 in all four labels.
        /// </summary>
        private void button3_Click(object sender, EventArgs e)
        {
            WriteConfig("0", "0", "0", "0");
            i = 0;
            label7.Text = i.ToString();
            label2.Text = i.ToString();
            label4.Text = i.ToString();
            label6.Text = i.ToString();
        }

        /// <summary>
        /// Overwrites the progress file with four lines: the last crawled id followed by
        /// the texts of label2, label4 and label6.
        /// </summary>
        private static void WriteConfig(string lastId, string label2Text, string label4Text, string label6Text)
        {
            // The using block flushes and closes the file even if a write throws.
            using (StreamWriter sw = new StreamWriter(ConfigPath, false))
            {
                sw.WriteLine(lastId);
                sw.WriteLine(label2Text);
                sw.WriteLine(label4Text);
                sw.WriteLine(label6Text);
            }
        }

        /// <summary>
        /// Downloads <paramref name="url"/> to <paramref name="targetPath"/>, creating the
        /// download directory first if needed.
        /// </summary>
        /// <returns>true when the file was written; false when the download failed.</returns>
        private static bool TryDownload(string url, string targetPath)
        {
            try
            {
                Directory.CreateDirectory(DownloadDirectory);
                using (WebClient wc = new WebClient())
                {
                    wc.DownloadFile(url, targetPath);
                }
                return true;
            }
            catch (WebException ex)
            {
                System.Diagnostics.Trace.WriteLine("Download of " + url + " failed: " + ex.Message);
            }
            catch (IOException ex)
            {
                System.Diagnostics.Trace.WriteLine("Cannot write " + targetPath + ": " + ex.Message);
            }
            catch (UnauthorizedAccessException ex)
            {
                System.Diagnostics.Trace.WriteLine("Cannot write " + targetPath + ": " + ex.Message);
            }
            return false;
        }
    }
}