Hello.
Sorry about my English.
I have a job that downloads and parses data from about 30,000 web pages.
/// <summary>
/// Parses the schedule table out of an HTML page and yields one
/// <c>DIRTY_SCHEDULE</c> per schedule row.
/// </summary>
/// <param name="ctx">HTML markup of the page to parse.</param>
/// <param name="url">Value stored in the <c>SOURCE</c> field of every yielded entry.</param>
/// <returns>
/// A lazily evaluated sequence of entries. Rows are consumed in pairs: the row at
/// the even index supplies the nine schedule cells, and the row that follows it
/// is handed to <c>FetchTransf</c> to collect the transfer legs.
/// </returns>
public override IEnumerable<DIRTY_SCHEDULE> Fetch(string ctx, string url = "") {
    var dom = DocumentBuilder.Html(ctx);
    try {
        // The ":even" pseudo-selector is not supported
        // (i.e. "#accordion2 table tbody tr:even" cannot be used),
        // so every row is selected and the loop advances two rows at a time.
        var trs = dom.QuerySelectorAll("#accordion2>.accordion-group>.accordion-heading>table>tbody>tr");
        for (var i = 0; i < trs.Length; i = i + 2) {
            var tds = trs[i].QuerySelectorAll("td");
            // Indices 0..8 are read below; a shorter row would otherwise throw.
            if (tds.Length < 9) {
                continue;
            }

            // A trailing row without a companion row has no transfer information;
            // indexing trs[i + 1] unconditionally would run past the end.
            var transfers = i + 1 < trs.Length
                ? this.FetchTransf(trs[i + 1]).ToList()
                : new List<DIRTY_SCHEDULE_TRANSF>();

            var entry = new DIRTY_SCHEDULE {
                // NOTE(review): CARRIER is the only text field that is not trimmed.
                // Left as-is because changing it could alter UNQTAG — confirm intent.
                CARRIER = tds[0].Text(),
                ROUTE = tds[1].Text().Trim(),
                VESSEL = tds[2].Text().Trim(),
                VOYAGE = tds[3].Text().Trim(),
                ORGIN = tds[4].Text().Trim(),
                ETD = tds[5].Text().Trim().ToDateTime("yyyy-MM-dd", DateTime.Now),
                DEST = tds[6].Text().Trim(),
                ETA = tds[7].Text().Trim().ToDateTime("yyyy-MM-dd", DateTime.Now),
                TT = tds[8].Text().Trim().ToDecimalOrNull(),
                DIRTY_SCHEDULE_TRANSF = transfers,
                SOURCE = url,
                APP = "Fetcher.Soushipping",
            };
            entry.UNQTAG = entry.GetUNQTag();
            yield return entry;
        }
    } finally {
        // Runs when enumeration completes or the enumerator is disposed.
        // Every yielded entry holds only extracted values (strings and a
        // materialized list), so nothing references the DOM after this point.
        // NOTE(review): the document is released only if its type implements
        // IDisposable — confirm against the parser version in use. This is meant
        // to stop parsed documents piling up when many pages are processed.
        var disposable = dom as IDisposable;
        if (disposable != null) {
            disposable.Dispose();
        }
    }
}
/// <summary>
/// Extracts the transfer legs from the row that accompanies a schedule row.
/// </summary>
/// <param name="tr">Row element that contains the <c>table.widget</c> elements.</param>
/// <returns>
/// One <c>DIRTY_SCHEDULE_TRANSF</c> for each widget table after the first that
/// has exactly three rows and a <c>td</c> cell in its second row; tables that
/// do not match this shape are skipped.
/// </returns>
private IEnumerable<DIRTY_SCHEDULE_TRANSF> FetchTransf(IElement tr) {
    var tbls = tr.QuerySelectorAll("table.widget");
    // The first table listed is the place of origin, so iteration starts at index 1.
    for (var i = 1; i < tbls.Length; i++) {
        var rows = tbls[i].QuerySelectorAll("tr");
        if (rows.Length != 3) {
            continue;
        }

        // QuerySelector returns null when the row has no <td>; treat that like
        // any other unexpected table shape instead of dereferencing null.
        var atCell = rows[1].QuerySelector("td");
        if (atCell == null) {
            continue;
        }

        yield return new DIRTY_SCHEDULE_TRANSF {
            VESSEL = rows[0].Text().Trim(),
            AT = atCell.Text().Trim(),
            VOYAGE = rows[2].Text().Trim(),
            // Zero-based sequence number: the origin table at index 0 is not counted.
            SEQ = i - 1
        };
    }
}
It works fine.
However, it leaks memory very badly.
/// <summary>
/// Parses the schedule table out of an HTML page and yields one
/// <c>DIRTY_SCHEDULE</c> per schedule row.
/// </summary>
/// <param name="ctx">HTML markup of the page to parse.</param>
/// <param name="url">Value stored in the <c>SOURCE</c> field of every yielded entry.</param>
/// <returns>
/// A lazily evaluated sequence of entries. Rows are consumed in pairs: the row at
/// the even index supplies the nine schedule cells, and the row that follows it
/// is handed to <c>FetchTransf</c> to collect the transfer legs.
/// </returns>
public override IEnumerable<DIRTY_SCHEDULE> Fetch(string ctx, string url = "") {
    var doc = new HtmlDocument();
    doc.LoadHtml2(ctx);
    var root = doc.DocumentNode;
    var trs = root.QuerySelectorAll("#accordion2>.accordion-group>.accordion-heading>table>tbody>tr")
        .ToList();
    // trs is a List, so the Count property is used rather than the LINQ Count() call.
    for (var i = 0; i < trs.Count; i = i + 2) {
        var tds = trs[i].QuerySelectorAll("td").ToList();
        // Indices 0..8 are read below; a shorter row would otherwise throw.
        if (tds.Count < 9) {
            continue;
        }

        // A trailing row without a companion row has no transfer information;
        // indexing trs[i + 1] unconditionally would run past the end.
        var transfers = i + 1 < trs.Count
            ? this.FetchTransf(trs[i + 1]).ToList()
            : new List<DIRTY_SCHEDULE_TRANSF>();

        var entry = new DIRTY_SCHEDULE {
            CARRIER = tds[0].InnerText.Clear(),
            ROUTE = tds[1].InnerText.Clear(),
            VESSEL = tds[2].InnerText.Clear(),
            VOYAGE = tds[3].InnerText.Clear(),
            ORGIN = tds[4].InnerText.Clear(),
            ETD = tds[5].InnerText.Clear().ToDateTime("yyyy-MM-dd", DateTime.Now),
            DEST = tds[6].InnerText.Clear(),
            ETA = tds[7].InnerText.Clear().ToDateTime("yyyy-MM-dd", DateTime.Now),
            TT = tds[8].InnerText.Clear().ToDecimalOrNull(),
            DIRTY_SCHEDULE_TRANSF = transfers,
            SOURCE = url,
            APP = "Fetcher.Soushipping",
        };
        entry.UNQTAG = entry.GetUNQTag();
        yield return entry;
    }
}
/// <summary>
/// Extracts the transfer legs from the row that accompanies a schedule row.
/// </summary>
/// <param name="tr">Row node that contains the <c>table.widget</c> elements.</param>
/// <returns>
/// One <c>DIRTY_SCHEDULE_TRANSF</c> for each widget table after the first that
/// has exactly three rows and a <c>td</c> cell in its second row; tables that
/// do not match this shape are skipped.
/// </returns>
private IEnumerable<DIRTY_SCHEDULE_TRANSF> FetchTransf(HtmlNode tr) {
    var tbls = tr.QuerySelectorAll("table.widget").ToList();
    // The first table listed is the place of origin, so iteration starts at index 1.
    for (var i = 1; i < tbls.Count; i++) {
        var rows = tbls[i].QuerySelectorAll("tr").ToList();
        if (rows.Count != 3) {
            continue;
        }

        // QuerySelector returns null when the row has no <td>; treat that like
        // any other unexpected table shape instead of dereferencing null.
        var atCell = rows[1].QuerySelector("td");
        if (atCell == null) {
            continue;
        }

        yield return new DIRTY_SCHEDULE_TRANSF {
            VESSEL = rows[0].InnerText.Clear(),
            AT = atCell.InnerText.Clear(),
            VOYAGE = rows[2].InnerText.Clear(),
            // Zero-based sequence number: the origin table at index 0 is not counted.
            SEQ = i - 1
        };
    }
}