评论

采集案例

58程序采集

using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.Windows.Forms;
using Newtonsoft.Json;
using System.Net;
using System.IO;
using System.Text.RegularExpressions;
using System.Security.Cryptography.X509Certificates;
using System.Net.Security;

namespace _58GatherProgress
{
    public partial class Form1 : Form
    {
        /// <summary>
        /// Initializes a new instance of the form.
        /// </summary>
        public Form1()
        {
            // InitializeComponent is defined outside this part of the partial class;
            // presumably it is the designer-generated control setup — confirm.
            InitializeComponent();
        }

        /// <summary>
        /// Click handler: downloads the page whose address is typed in <c>textBox1</c>,
        /// parses the listing items out of it and shows them in the grid.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data (unused).</param>
        private void button1_Click(object sender, EventArgs e)
        {
            string url = textBox1.Text;
            if (string.IsNullOrEmpty(url))
            {
                return;
            }

            string result;
            try
            {
                result = Get(url);
            }
            catch (Exception ex)
            {
                // A failed download yields no HTML, so there is nothing to parse:
                // tell the user what went wrong instead of feeding the error text
                // to the HTML parser, and leave the grid empty.
                MessageBox.Show(ex.Message);
                ShowData(new List<InfoModel>());
                return;
            }

            List<InfoModel> list = GetHtml(result);
            ShowData(list);
        }

        /// <summary>
        /// Sends <paramref name="postData"/> to <paramref name="_url"/> as the body of a
        /// form-encoded HTTP POST and returns the response body decoded as UTF-8.
        /// </summary>
        /// <param name="_url">Address to post to.</param>
        /// <param name="postData">
        /// Request body. It is sent as given; this method does not URL-encode it.
        /// </param>
        /// <returns>The complete response body.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="postData"/> is null.</exception>
        /// <exception cref="WebException">The request failed or exceeded the 10 second timeout.</exception>
        private string GetRequest(string _url, string postData)
        {
            // The Content-Type header below announces UTF-8, so the body must really be
            // UTF-8; ASCII encoding would turn every non-ASCII character into '?'.
            byte[] data = Encoding.UTF8.GetBytes(postData);

            HttpWebRequest myRequest = (HttpWebRequest)WebRequest.Create(_url);
            myRequest.Accept = "text/html, */*; q=0.01";
            myRequest.Method = "POST";
            myRequest.KeepAlive = true;
            myRequest.Timeout = 10000;
            myRequest.ContentType = "application/x-www-form-urlencoded; charset=UTF-8";
            myRequest.ContentLength = data.Length;
            myRequest.UserAgent = "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Mobile Safari/537.36";

            // using blocks release the streams and the connection even when a call throws
            using (Stream newStream = myRequest.GetRequestStream())
            {
                newStream.Write(data, 0, data.Length);
            }

            using (HttpWebResponse myResponse = (HttpWebResponse)myRequest.GetResponse())
            using (StreamReader reader = new StreamReader(myResponse.GetResponseStream(), Encoding.UTF8))
            {
                return reader.ReadToEnd();
            }
        }


        /// <summary>
        /// Certificate validation callback that accepts every server certificate.
        /// </summary>
        /// <param name="sender">Unused.</param>
        /// <param name="certificate">Unused.</param>
        /// <param name="chain">Unused.</param>
        /// <param name="errors">Unused; reported policy errors are ignored.</param>
        /// <returns>Always <c>true</c>.</returns>
        public static bool CheckValidationResult(object sender, X509Certificate certificate, X509Chain chain, SslPolicyErrors errors)
        {
            // Accept unconditionally, otherwise the page cannot be opened.
            // NOTE(review): this switches off certificate checking wherever the callback
            // is installed — confirm that is acceptable for this tool.
            return true;
        }

        /// <summary>
        /// Performs an HTTP GET request and returns the response body.
        /// </summary>
        /// <param name="url">Address to request.</param>
        /// <returns>
        /// The response body decoded as UTF-8 and trimmed. When the server answers with
        /// an error status, the body of that error response is returned instead.
        /// </returns>
        /// <exception cref="WebException">
        /// The request failed without producing an HTTP response (for example a DNS
        /// failure or a timeout).
        /// </exception>
        public string Get(string url)
        {
            HttpWebRequest request = null;
            HttpWebResponse response = null;

            try
            {
                // Maximum number of concurrent connections
                ServicePointManager.DefaultConnectionLimit = 200;

                // HTTPS setup
                if (url.StartsWith("https", StringComparison.OrdinalIgnoreCase))
                {
                    ServicePointManager.ServerCertificateValidationCallback =
                            new RemoteCertificateValidationCallback(CheckValidationResult);
                    // SSL 3.0 is deliberately not enabled: it is obsolete and insecure,
                    // and newer runtimes refuse the flag.
                    ServicePointManager.SecurityProtocol = SecurityProtocolType.Tls | SecurityProtocolType.Tls11 | SecurityProtocolType.Tls12;
                }

                request = (HttpWebRequest)WebRequest.Create(url);
                request.UserAgent = "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Mobile Safari/537.36";
                request.Method = "GET";
                request.Headers.Add(HttpRequestHeader.Cookie, "f=n; ABTESTCOOKIEVALUE=1; HISTORY_CATE_IDS=70124%2C246%7C%E4%BA%8C%E6%89%8B%E8%AE%BE%E5%A4%87%7C1%7C246; cookieuid=e4bde6aa-74aa-408a-b2cc-121b786d09c7; device=m; id58=CocI02Pq6N9nzcH1CRH2Ag==; sessionid=55affd17-8ba3-4661-8e8d-19b1dc4d675f");

                response = (HttpWebResponse)request.GetResponse();
                return ReadResponseText(response);
            }
            catch (WebException e)
            {
                // Response is null when no HTTP answer was received at all (DNS failure,
                // timeout, refused connection); there is no body to return in that case,
                // so let the original exception propagate with its stack trace intact.
                HttpWebResponse errorResponse = e.Response as HttpWebResponse;
                if (errorResponse == null)
                {
                    throw;
                }

                // Do not leak a response obtained before the failure
                if (response != null && !ReferenceEquals(response, errorResponse))
                {
                    response.Close();
                }
                response = errorResponse;
                return ReadResponseText(response);
            }
            finally
            {
                // Release the connection and the request
                if (response != null)
                {
                    response.Close();
                }
                if (request != null)
                {
                    request.Abort();
                }
            }
        }

        /// <summary>
        /// Reads the whole body of <paramref name="response"/> as UTF-8 text and trims it.
        /// </summary>
        private static string ReadResponseText(HttpWebResponse response)
        {
            using (StreamReader sr = new StreamReader(response.GetResponseStream(), Encoding.UTF8))
            {
                return sr.ReadToEnd().Trim();
            }
        }

        /// <summary>
        /// Shows the given items in the data grid.
        /// </summary>
        /// <param name="infolist">Items to display; assigned as the data source of <c>dataGridView1</c>.</param>
        private void ShowData(List<InfoModel> infolist)
        {
            dataGridView1.DataSource = infolist;
        }


        /// <summary>
        /// Extracts the listing items from a downloaded page and writes the number of
        /// parsed items to <c>textBox2</c>.
        /// </summary>
        /// <param name="infoCont">HTML of the listing page.</param>
        /// <returns>
        /// One <see cref="InfoModel"/> per list item that contains at least two anchors;
        /// <c>null</c> when <paramref name="infoCont"/> is null or empty.
        /// </returns>
        public List<InfoModel> GetHtml(string infoCont)
        {
            if (string.IsNullOrEmpty(infoCont))
            {
                return null;
            }

            // Every list item starts with this marker. Empty entries are kept so that
            // index 0 is always the text in front of the first item, even when the
            // content begins with the marker itself.
            string[] values = infoCont.Split(new string[] { "<li class=\"item item_new item-new\"" }, StringSplitOptions.None);

            List<InfoModel> list = new List<InfoModel>();
            for (int k = 1; k < values.Length; k++)
            {
                // Cut the item at its anchors: units[1] is read for the id and contact,
                // units[2] for the title, link and area.
                string[] units = values[k].Split(new string[] { "<a" }, StringSplitOptions.RemoveEmptyEntries);
                if (units.Length < 3)
                {
                    // Not shaped like a listing item; skip it instead of indexing past the end
                    continue;
                }

                InfoModel model = new InfoModel();
                model.InfoID = getPatternValue(units[1], @"infoid='([0-9]*)");
                model.LinkMan = getPatternValue(units[1], @"[\u4e00-\u9fa5]+");
                model.InfoTitle = getPatternValue(units[2], @"<strong (.*)>[\s\S]*<\/strong>");
                model.Href = getPatternValue(units[2], @"href=([""'])?(?<href>[^'""]+)\1[^>]*>");
                model.Area = getPatternValue(units[2], "<span class=\"info-serve-txt\">[\\S]*?<\\/span>");
                model.Remark = units[2];
                list.Add(model);
            }

            // Report the items actually parsed (the leading page chunk is not an item)
            textBox2.Text = "共" + list.Count + "条";
            return list;
        }

        /// <summary>
        /// Returns the text of the first match of <paramref name="pattern"/> in
        /// <paramref name="content"/> with span, b and strong tags, the
        /// <c>infoid='</c> prefix and CRLF line breaks removed.
        /// </summary>
        /// <param name="content">Text to search.</param>
        /// <param name="pattern">Regular expression, applied case-insensitively in multiline mode.</param>
        /// <returns>The cleaned, trimmed match text, or an empty string when nothing matches.</returns>
        public static string getPatternValue(string content, string pattern)
        {
            if (string.IsNullOrEmpty(content))
            {
                return "";
            }

            // Only the first match is used. Later matches used to be read through a group
            // index equal to the match index and appended to themselves, which produced
            // empty or doubled values whenever the pattern matched more than once.
            Match first = Regex.Match(content, pattern, RegexOptions.IgnoreCase | RegexOptions.Multiline);
            if (!first.Success)
            {
                return "";
            }

            string result = first.Value.Trim();

            // Markup is stripped in this fixed order
            string[] markupPatterns = { "</span>", "<span.*?>", "<b.*?>", "</b>", "<strong.*?>", "</strong>" };
            foreach (string markup in markupPatterns)
            {
                result = Regex.Replace(result, markup, "");
            }

            result = result.Replace("infoid='", "").Replace("\r\n", "");
            return result.Trim();
        }

        /// <summary>
        /// Collects the named group <paramref name="groupName"/> from every match of
        /// <paramref name="pattern"/> in <paramref name="content"/>, strips a, span, font
        /// and strong tags plus <c>&amp;nbsp;</c> from each value and joins the values
        /// with commas.
        /// </summary>
        /// <param name="content">Text to search.</param>
        /// <param name="pattern">Regular expression, applied case-insensitively in multiline mode.</param>
        /// <param name="groupName">Name of the capture group to read from each match.</param>
        /// <returns>
        /// The comma-separated values; an empty string when there is no match or when
        /// <paramref name="groupName"/> does not occur in the pattern after its first character.
        /// </returns>
        public static string GetRegValue(string content, string pattern, string groupName)
        {
            // Nothing to extract unless the pattern text mentions the group name
            if (pattern.IndexOf(groupName) <= 0)
            {
                return "";
            }

            // Applied one after another, in this order, to every captured value
            string[] markupPatterns =
            {
                "<a.*?>", "</a>", "<span.*?>", "</span>", "&nbsp;",
                "<font.*?>", "</font>", "<strong.*?>", "</strong>"
            };

            Regex matcher = new Regex(pattern, RegexOptions.Multiline | RegexOptions.IgnoreCase);
            List<string> pieces = new List<string>();
            foreach (Match hit in matcher.Matches(content))
            {
                string piece = hit.Groups[groupName].Value.Trim();
                foreach (string markup in markupPatterns)
                {
                    piece = Regex.Replace(piece, markup, "");
                }
                pieces.Add(piece);
            }

            return string.Join(",", pieces.ToArray());
        }
    }
}

最后一次编辑于  2023-02-14  
点赞 0
收藏
评论
登录 后发表内容