Skip to content

Commit f0b84e3

Browse files
authored
Merge pull request #68 from TurnerSoftware/fix-max-pages-allowed-bug
Fix max pages allowed bug
2 parents 4f51b11 + 76d0bf0 commit f0b84e3

6 files changed

Lines changed: 57 additions & 46 deletions

File tree

src/InfinityCrawler/Crawler.cs

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -63,13 +63,12 @@ public async Task<CrawlResult> Crawl(Uri siteUri, CrawlSettings settings)
6363

6464
result.CrawledUris = await crawlRunner.ProcessAsync(async (requestResult, crawlState) =>
6565
{
66-
var response = requestResult.ResponseMessage;
67-
using (var contentStream = await response.Content.ReadAsStreamAsync())
66+
using (requestResult.Content)
6867
{
69-
var headers = new CrawlHeaders(response.Headers, response.Content.Headers);
70-
var content = settings.ContentProcessor.Parse(crawlState.Location, headers, contentStream);
71-
contentStream.Seek(0, SeekOrigin.Begin);
72-
content.RawContent = await new StreamReader(contentStream).ReadToEndAsync();
68+
var headers = new CrawlHeaders(requestResult.ResponseHeaders, requestResult.ContentHeaders);
69+
var content = settings.ContentProcessor.Parse(crawlState.Location, headers, requestResult.Content);
70+
requestResult.Content.Seek(0, SeekOrigin.Begin);
71+
content.RawContent = await new StreamReader(requestResult.Content).ReadToEndAsync();
7372
crawlRunner.AddResult(crawlState.Location, content);
7473
}
7574
});

src/InfinityCrawler/Internal/CrawlRunner.cs

Lines changed: 14 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ private Uri StripFragment(Uri uri)
5050
}.Uri;
5151
}
5252

53-
public void AddLink(CrawlLink crawlLink)
53+
private void AddLink(CrawlLink crawlLink)
5454
{
5555
if (crawlLink.Relationship != null && crawlLink.Relationship.Equals("nofollow", StringComparison.InvariantCultureIgnoreCase))
5656
{
@@ -109,14 +109,6 @@ public void AddResult(Uri requestUri, CrawledContent content)
109109
{
110110
Logger?.LogDebug($"Result for {requestUri} has completed successfully with content.");
111111

112-
if (robotsPageDefinition.CanFollowLinks(Settings.UserAgent))
113-
{
114-
foreach (var crawlLink in content.Links)
115-
{
116-
AddLink(crawlLink);
117-
}
118-
}
119-
120112
AddResult(new CrawledUri
121113
{
122114
Location = crawlState.Location,
@@ -125,6 +117,14 @@ public void AddResult(Uri requestUri, CrawledContent content)
125117
Requests = crawlState.Requests,
126118
Content = content
127119
});
120+
121+
if (robotsPageDefinition.CanFollowLinks(Settings.UserAgent))
122+
{
123+
foreach (var crawlLink in content.Links)
124+
{
125+
AddLink(crawlLink);
126+
}
127+
}
128128
}
129129
}
130130
}
@@ -232,7 +232,7 @@ await Settings.RequestProcessor.ProcessAsync(
232232
Location = requestResult.RequestUri
233233
});
234234

235-
if (requestResult.ResponseMessage == null)
235+
if (requestResult.Exception != null)
236236
{
237237
//Retry failed requests
238238
Logger?.LogDebug($"An exception occurred while requesting {crawlState.Location}. This URL will be added to the request queue to be attempted again later.");
@@ -245,14 +245,12 @@ await Settings.RequestProcessor.ProcessAsync(
245245
}
246246
else
247247
{
248-
var response = requestResult.ResponseMessage;
249-
250248
var crawlRequest = new CrawlRequest
251249
{
252250
RequestStart = requestResult.RequestStart,
253251
ElapsedTime = requestResult.ElapsedTime,
254-
StatusCode = response.StatusCode,
255-
IsSuccessfulStatus = response.IsSuccessStatusCode
252+
StatusCode = requestResult.StatusCode,
253+
IsSuccessfulStatus = (int)requestResult.StatusCode is >= 200 and <= 299
256254
};
257255
crawlState.Requests.Add(crawlRequest);
258256

@@ -264,8 +262,8 @@ await Settings.RequestProcessor.ProcessAsync(
264262
};
265263
if (redirectStatusCodes.Contains(crawlRequest.StatusCode.Value))
266264
{
267-
Logger?.LogDebug($"Result for {crawlState.Location} was a redirect ({response.Headers.Location}). This URL will be added to the request queue.");
268-
AddRedirect(crawlState.Location, response.Headers.Location);
265+
Logger?.LogDebug($"Result for {crawlState.Location} was a redirect ({requestResult.ResponseHeaders.Location}). This URL will be added to the request queue.");
266+
AddRedirect(crawlState.Location, requestResult.ResponseHeaders.Location);
269267
}
270268
else if (crawlRequest.IsSuccessfulStatus)
271269
{

src/InfinityCrawler/Processing/Requests/DefaultRequestProcessor.cs

Lines changed: 19 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ public async Task ProcessAsync(HttpClient httpClient, Func<RequestResult, Task>
3939
}
4040

4141
var random = new Random();
42-
var activeRequests = new ConcurrentDictionary<Task, RequestContext>(options.MaxNumberOfSimultaneousRequests, options.MaxNumberOfSimultaneousRequests);
42+
var activeRequests = new ConcurrentDictionary<Task<RequestResult>, RequestContext>(options.MaxNumberOfSimultaneousRequests, options.MaxNumberOfSimultaneousRequests);
4343

4444
var currentBackoff = 0;
4545
var successesSinceLastThrottle = 0;
@@ -77,7 +77,7 @@ public async Task ProcessAsync(HttpClient httpClient, Func<RequestResult, Task>
7777

7878
Logger?.LogDebug($"Request #{requestContext.RequestNumber} ({requestUri}) starting with a {requestStartDelay}ms delay.");
7979

80-
var task = PerformRequestAsync(httpClient, responseAction, requestContext);
80+
var task = PerformRequestAsync(httpClient, requestContext);
8181

8282
activeRequests.TryAdd(task, requestContext);
8383
requestCount++;
@@ -91,6 +91,8 @@ public async Task ProcessAsync(HttpClient httpClient, Func<RequestResult, Task>
9191

9292
await Task.WhenAny(activeRequests.Keys).ConfigureAwait(false);
9393

94+
cancellationToken.ThrowIfCancellationRequested();
95+
9496
var completedRequests = activeRequests.Keys.Where(t => t.IsCompleted);
9597
foreach (var completedRequest in completedRequests)
9698
{
@@ -105,6 +107,8 @@ public async Task ProcessAsync(HttpClient httpClient, Func<RequestResult, Task>
105107
ExceptionDispatchInfo.Capture(aggregateException.InnerException).Throw();
106108
}
107109

110+
await responseAction(completedRequest.Result);
111+
108112
//Manage the throttling based on timeouts and successes
109113
var throttlePoint = options.TimeoutBeforeThrottle;
110114
if (throttlePoint.TotalMilliseconds > 0 && requestContext.Timer.Elapsed > throttlePoint)
@@ -130,7 +134,7 @@ public async Task ProcessAsync(HttpClient httpClient, Func<RequestResult, Task>
130134
Logger?.LogDebug($"Completed processing {requestCount} requests.");
131135
}
132136

133-
private async Task PerformRequestAsync(HttpClient httpClient, Func<RequestResult, Task> responseAction, RequestContext context)
137+
private async Task<RequestResult> PerformRequestAsync(HttpClient httpClient, RequestContext context)
134138
{
135139
if (context.RequestStartDelay > 0)
136140
{
@@ -146,7 +150,9 @@ private async Task PerformRequestAsync(HttpClient httpClient, Func<RequestResult
146150
var combinedToken = CancellationTokenSource.CreateLinkedTokenSource(context.CancellationToken, timeoutToken).Token;
147151
using (var response = await httpClient.GetAsync(context.RequestUri, combinedToken))
148152
{
149-
await response.Content.LoadIntoBufferAsync();
153+
var contentStream = new MemoryStream();
154+
await response.Content.CopyToAsync(contentStream);
155+
contentStream.Seek(0, SeekOrigin.Begin);
150156

151157
//We only want to time the request, not the handling of the response
152158
context.Timer.Stop();
@@ -155,19 +161,23 @@ private async Task PerformRequestAsync(HttpClient httpClient, Func<RequestResult
155161

156162
Logger?.LogDebug($"Request #{context.RequestNumber} completed successfully in {context.Timer.ElapsedMilliseconds}ms.");
157163

158-
await responseAction(new RequestResult
164+
return new RequestResult
159165
{
160166
RequestUri = context.RequestUri,
161167
RequestStart = requestStart,
162168
RequestStartDelay = context.RequestStartDelay,
163-
ResponseMessage = response,
169+
StatusCode = response.StatusCode,
170+
ResponseHeaders = response.Headers,
171+
ContentHeaders = response.Content.Headers,
172+
Content = contentStream,
164173
ElapsedTime = context.Timer.Elapsed
165-
});
174+
};
166175
}
167176
}
168177
catch (OperationCanceledException) when (context.CancellationToken.IsCancellationRequested)
169178
{
170179
Logger?.LogDebug($"Request #{context.RequestNumber} cancelled.");
180+
return null;
171181
}
172182
catch (Exception ex) when (ex is HttpRequestException || ex is OperationCanceledException)
173183
{
@@ -176,14 +186,14 @@ await responseAction(new RequestResult
176186
Logger?.LogDebug($"Request #{context.RequestNumber} completed with error in {context.Timer.ElapsedMilliseconds}ms.");
177187
Logger?.LogTrace(ex, $"Request #{context.RequestNumber} Exception: {ex.Message}");
178188

179-
await responseAction(new RequestResult
189+
return new RequestResult
180190
{
181191
RequestUri = context.RequestUri,
182192
RequestStart = requestStart,
183193
RequestStartDelay = context.RequestStartDelay,
184194
ElapsedTime = context.Timer.Elapsed,
185195
Exception = ex
186-
});
196+
};
187197
}
188198
}
189199
}

src/InfinityCrawler/Processing/Requests/RequestResult.cs

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
using System;
22
using System.Collections.Generic;
3+
using System.IO;
4+
using System.Net;
35
using System.Net.Http;
6+
using System.Net.Http.Headers;
47
using System.Text;
58

69
namespace InfinityCrawler.Processing.Requests
@@ -10,7 +13,10 @@ public class RequestResult
1013
public Uri RequestUri { get; set; }
1114
public DateTime RequestStart { get; set; }
1215
public double RequestStartDelay { get; set; }
13-
public HttpResponseMessage ResponseMessage { get; set; }
16+
public HttpStatusCode? StatusCode { get; set; }
17+
public HttpResponseHeaders ResponseHeaders { get; set; }
18+
public HttpContentHeaders ContentHeaders { get; set; }
19+
public Stream Content { get; set; }
1420
public TimeSpan ElapsedTime { get; set; }
1521
public Exception Exception { get; set; }
1622
}

tests/InfinityCrawler.Tests/BasicSiteTests.cs

Lines changed: 12 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -141,9 +141,11 @@ public async Task MaximumRedirectLimitFollowed()
141141
Assert.AreEqual(CrawlStatus.MaxRedirects, crawledUri.Status);
142142
Assert.AreEqual(3, crawledUri.RedirectChain.Count);
143143
}
144-
145-
[TestMethod]
146-
public async Task MaximumPagesCrawledFollowed()
144+
145+
[DataRow(2)]
146+
[DataRow(4)]
147+
[DataTestMethod]
148+
public async Task MaximumPagesCrawledFollowed(int maxPages)
147149
{
148150
var crawler = GetTestSiteCrawler(new SiteContext
149151
{
@@ -155,15 +157,11 @@ public async Task MaximumPagesCrawledFollowed()
155157
RequestProcessorOptions = GetNoDelayRequestProcessorOptions()
156158
};
157159

158-
settings.MaxNumberOfPagesToCrawl = 4;
160+
settings.MaxNumberOfPagesToCrawl = maxPages;
159161
var result = await crawler.Crawl(new Uri("http://localhost/"), settings);
160-
Assert.AreEqual(4, result.CrawledUris.Count());
161-
162-
settings.MaxNumberOfPagesToCrawl = 2;
163-
result = await crawler.Crawl(new Uri("http://localhost/"), settings);
164-
Assert.AreEqual(2, result.CrawledUris.Count());
162+
Assert.AreEqual(maxPages, result.CrawledUris.Count());
165163
}
166-
164+
167165
[TestMethod]
168166
public async Task AutoRetryOnFailure()
169167
{
@@ -185,10 +183,10 @@ public async Task AutoRetryOnFailure()
185183
}
186184
};
187185

188-
settings.RequestProcessor.Add(new Uri("http://localhost/delay/300/300ms-delay-1"));
189-
settings.RequestProcessor.Add(new Uri("http://localhost/delay/300/300ms-delay-2"));
190-
settings.RequestProcessor.Add(new Uri("http://localhost/delay/300/300ms-delay-3"));
191-
settings.RequestProcessor.Add(new Uri("http://localhost/delay/300/300ms-delay-4"));
186+
settings.RequestProcessor.Add(new Uri("http://localhost/delay/500/500ms-delay-1"));
187+
settings.RequestProcessor.Add(new Uri("http://localhost/delay/500/500ms-delay-2"));
188+
settings.RequestProcessor.Add(new Uri("http://localhost/delay/500/500ms-delay-3"));
189+
settings.RequestProcessor.Add(new Uri("http://localhost/delay/500/500ms-delay-4"));
192190

193191
var results = await crawler.Crawl(new Uri("http://localhost/"), settings);
194192
var delayedCrawls = results.CrawledUris.Where(c => c.Location.PathAndQuery.Contains("delay")).ToArray();

0 commit comments

Comments (0)