写了个简单的多线程图片爬虫,整理一下。数据已经爬下来了,但图片URL需要自行拼接:首先从Lawyers表中取出RawData字段,RawData是json格式的数据,其中有一个list字段,需要的只是list中的pic和XZQH两个字段,用于拼接图片地址。拼接URL的规则如下:
http://www.xxxxx.cn/imagetype/{model.XZQH.Substring(0,2)}00/lsfw/lsuser/{model.pic.Substring(0,model.pic.LastIndexOf('.'))}/{model.pic.Substring(model.pic.LastIndexOf('.') + 1)}
得到图片URL之后就好说了,接下来就是常规的下载操作。线程调度的核心思想是保持四个线程同时工作:当其中一个下载完成或下载失败后,就移除该线程,并启动一个新线程重复同样的工作。代码如下:
/// <summary>
/// Downloads lawyer images with a small pool of worker tasks.
/// Each worker claims one page of <see cref="Lawyers"/> rows, builds every image URL from the
/// row's raw JSON and saves the image under <c>D:\ImageTestMulti\{LawFrimKey}</c>.
/// Failures are appended to <c>ErrorLog.txt</c> next to the executable.
/// </summary>
public class Main : HandleProgramBase, IHandleProgram
{
    public readonly IUnitOfWork _iUnitOfWork;

    public Main(IUnitOfWork iUnitOfWork)
    {
        _iUnitOfWork = iUnitOfWork;
    }

    /// <summary>Number of worker tasks kept running at the same time.</summary>
    private const int WorkerCount = 4;

    /// <summary>Width of the id range claimed by one worker per query.</summary>
    private const int PageSize = 50;

    /// <summary>Maximum number of download attempts per image.</summary>
    private const int MaxAttempts = 10;

    private List<Task> threadManager = new List<Task>();

    // Guards the shared paging cursor "start".
    private static readonly object locker = new object();

    // Serializes the console progress output.
    private static readonly object counter = new object();

    private static ConcurrentQueue<int> counterQueue = new ConcurrentQueue<int>();
    private static ReaderWriterLockSlim logWriteLock = new ReaderWriterLockSlim();
    private const int total = 150136;
    private static int start = 1;
    private static int downloadNumber = 0;

    /// <summary>
    /// Runs the download: keeps <see cref="WorkerCount"/> tasks alive until the paging cursor
    /// reaches <c>total</c>, then waits for the remaining tasks and prints the elapsed time.
    /// </summary>
    /// <param name="args">Command-line arguments; not used by this method.</param>
    /// <exception cref="AggregateException">
    /// Thrown by the final <c>Task.WaitAll</c> if one of the last running workers faulted.
    /// </exception>
    public override void Entrance(string[] args)
    {
        var watcher = new Stopwatch();
        watcher.Start();
        while (start < total)
        {
            if (!threadManager.Any())
            {
                // First pass: fill the pool.
                for (var i = 0; i < WorkerCount; i++)
                {
                    threadManager.Add(Task.Factory.StartNew(DownloadImg));
                }
                continue;
            }

            Task.WaitAny(threadManager.ToArray());
            foreach (var finished in threadManager.Where(a => a.IsCompleted).ToList())
            {
                if (finished.IsFaulted)
                {
                    // Reading Exception observes the fault so it is not silently dropped.
                    WriteLog($"下载任务异常终止! 错误:{finished.Exception?.GetBaseException().Message}");
                }

                // Replace every finished worker (successful or faulted) with a fresh one.
                threadManager.Remove(finished);
                finished.Dispose();
                threadManager.Add(Task.Factory.StartNew(DownloadImg));
            }
        }
        Task.WaitAll(threadManager.ToArray());
        watcher.Stop();
        Console.WriteLine();
        Console.WriteLine("Download Completed.Total time: " + watcher.ElapsedMilliseconds + " ms.");
    }

    /// <summary>
    /// Worker body: claims the next page of rows under the paging lock and downloads the image of
    /// every row in it. A row that cannot be processed is logged and skipped so that the rest of
    /// the page is still handled.
    /// </summary>
    private void DownloadImg()
    {
        using (var web = new WebClient())
        {
            var lawyerList = new List<Lawyers>();
            lock (locker)
            {
                if (start < total)
                {
                    var end = start + PageSize < total ? start + PageSize : total;
                    lawyerList = _iUnitOfWork.Implement<Lawyers>(string.Format(Resource.GetPagedLawyer, start, end)).ToList();
                    start = end;
                }
            }
            if (!lawyerList.Any()) return;

            foreach (var lawyer in lawyerList)
            {
                string imgUrl;
                string fileName;
                try
                {
                    var model = JsonConvert.DeserializeObject<RawData>(lawyer.RawData).list;
                    imgUrl = BuildImageUrl(model);
                    fileName = PrepareTargetFile(lawyer);
                }
                catch (Exception e)
                {
                    // Malformed row (bad JSON, missing pic/XZQH/ImageName, unusable path):
                    // retrying cannot help, so log it and move on to the next row.
                    WriteLog($"{lawyer.ImageName}爬取失败! 错误:{e.Message}当前Id:{lawyer.Id}。");
                    continue;
                }

                for (var attempt = 1; attempt <= MaxAttempts; attempt++)
                {
                    try
                    {
                        if (!File.Exists(fileName)) web.DownloadFile(imgUrl, fileName);
                        ReportProgress();
                        break;
                    }
                    catch (Exception e)
                    {
                        // Only a WebException carries an HTTP response; "as" keeps any other
                        // exception type from blowing up inside the handler.
                        var response = (e as WebException)?.Response as HttpWebResponse;
                        var notFound = response?.StatusCode == HttpStatusCode.NotFound;
                        if (attempt == MaxAttempts || notFound)
                        {
                            WriteLog($"{lawyer.ImageName}爬取失败! 错误:{e.Message}当前Id:{lawyer.Id}。");
                            break;
                        }

                        // Back off before the next attempt.
                        Thread.Sleep(1000);
                    }
                }
            }
        }
    }

    /// <summary>
    /// Builds the image URL from the first two characters of <c>XZQH</c> and from <c>pic</c>
    /// split at its last dot.
    /// </summary>
    private static string BuildImageUrl(Lawyer model)
    {
        var dot = model.pic.LastIndexOf('.');
        return $"http://www.xxxxx.cn/imagetype/{model.XZQH.Substring(0, 2)}00/lsfw/lsuser/{model.pic.Substring(0, dot)}/{model.pic.Substring(dot + 1)}";
    }

    /// <summary>
    /// Returns the local file path for a row's image and makes sure its directory exists.
    /// Handles names that contain sub-folders, e.g. 2017-04-19/B748FA5EF1517886AF76A11CDACE5378.png.
    /// </summary>
    private static string PrepareTargetFile(Lawyers lawyer)
    {
        var savePath = $@"D:\ImageTestMulti\{lawyer.LawFrimKey}";
        var fileName = savePath + "\\" + lawyer.ImageName.Replace("/", "\\");
        // CreateDirectory creates every missing level and does nothing if the directory exists.
        Directory.CreateDirectory(Path.GetDirectoryName(fileName));
        return fileName;
    }

    /// <summary>
    /// Records one processed image and refreshes the download count shown on the console.
    /// </summary>
    private static void ReportProgress()
    {
        counterQueue.Enqueue(1);
        lock (counter)
        {
            Console.WriteLine(Resource.Space);
            Console.SetCursorPosition(0, Console.CursorTop - 1);
            Console.Write(Resource.DownloadNumber, ++downloadNumber, counterQueue.Count);
        }
    }

    /// <summary>
    /// Appends a timestamped line to <c>ErrorLog.txt</c> in the executable's directory.
    /// The file is overwritten instead of appended to once it is larger than 5 MB.
    /// Errors while writing are printed to the console and not rethrown.
    /// </summary>
    /// <param name="errMsg">Message to write.</param>
    public static void WriteLog(string errMsg)
    {
        DateTime dt = DateTime.Now;
        string filePathName;
        using (var process = System.Diagnostics.Process.GetCurrentProcess())
        {
            filePathName = Path.Combine(Path.GetDirectoryName(process.MainModule.FileName) ?? string.Empty, "ErrorLog.txt");
        }

        try
        {
            logWriteLock.EnterWriteLock();
            var startOver = File.Exists(filePathName) && new FileInfo(filePathName).Length > 5 * 1024 * 1024;
            // The writer is closed inside the lock so the next writer never finds the file still open.
            using (var sw = new StreamWriter(filePathName, !startOver))
            {
                sw.WriteLine(dt.ToShortDateString() + " " + dt.ToShortTimeString() + " " + errMsg);
            }
        }
        catch (Exception ex)
        {
            Console.WriteLine(ex.Message);
        }
        finally
        {
            if (logWriteLock.IsWriteLockHeld)
                logWriteLock.ExitWriteLock();
        }
    }

    public override string Helper { get; protected set; }
}
几个实体类:
/// <summary>
/// One row read from the Lawyers table by the downloader.
/// </summary>
public class Lawyers
{
/// <summary>Row id; written to the error log when the image cannot be downloaded.</summary>
public int Id { get; set; }
/// <summary>Used as the sub-folder name under the download root. NOTE(review): the name looks like a misspelling of "LawFirmKey" - confirm against the table column before renaming.</summary>
public string LawFrimKey { get; set; }
/// <summary>JSON text that is deserialized into <see cref="RawData"/>.</summary>
public string RawData { get; set; }
/// <summary>Relative name of the saved image file; may contain '/'-separated folders such as a date.</summary>
public string ImageName { get; set; }
}
/// <summary>
/// Deserialization target for the JSON stored in <c>Lawyers.RawData</c>.
/// </summary>
public class RawData
{
/// <summary>The "list" member of the JSON; holds the fields used to build the image URL.</summary>
public Lawyer list { get; set; }
}
/// <summary>
/// The two fields of the raw JSON "list" object that are needed to build the image URL.
/// </summary>
public class Lawyer
{
/// <summary>Picture name; the URL uses the part before the last '.' and the part after it as separate path segments.</summary>
public string pic { get; set; }
/// <summary>Code whose first two characters go into the URL. Presumably an administrative-division code - TODO confirm.</summary>
public string XZQH { get; set; }
}