import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.Scanner;
import java.util.Set;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* This class represents a "web crawler" that crawls the Web looking for
* image URLs. The {@link #addAddressToQueue(URL)} method should be called
* at least once to give the crawler a starting web page. It will download
* that page and search it for image URLs in IMG tags. It puts the image
* URLs into a queue. Images can be obtained from the queue by calling
* {@link #getNextImageURL()} or {@link #getNextImageURLIfAvailable()}.
* The crawler also searches the web page for links in A tags and adds the
* links that it finds to a queue of URLs. URLs are removed continually from
* this queue and are searched in the same way. The crawling is done by
* multiple threads.
*
* Note that the public interface to the class is small and simple!
*
* Note that the crawler is "throttled" by the fact that the image queue
* is of limited size. After it fills up, the crawling threads will block
* until room is made in the queue. After that, the threads will only look
* for more images as necessary to keep the image queue full.
*
* Limitations: The URL queue has a limited size. When it is full,
* new URLs that are found are simply discarded. This is to avoid a
* deadlock that could occur if all the crawling threads were blocked
* while trying to add URLs to a full queue. It might be better to let
* the queue grow to arbitrary size, or at least to let it grow to a very
* large size. The method for determining the base URL of a web page is
* not correct -- it is assumed that the base URL is the URL that was used
* to access the page. This class is NOT meant to be a serious web
* crawling program. It is just a demonstration.
*/
public class ThreadedImageCrawler {
/**
* A delay, in milliseconds, that is inserted by a crawling thread
* after it downloads a web page. This is to help avoid flooding
* the net with a continuous stream of requests.
*/
private final static int NICENESS_DELAY = 500;
/**
* If a page contains more than this number of image links, excess
* links are discarded rather than added to the queue.
*/
private final static int MAX_IMAGES_PER_PAGE = 20;
/**
* Create a crawler that uses a specified number of threads. Uses a URL
* queue of length 250 and an image queue of length 50.
* @param threadPoolSize the number of crawling threads to create; must be
* positive
* @throws IllegalArgumentException if the threadPoolSize is 0 or less.
*/
public ThreadedImageCrawler(int threadPoolSize) {
this(threadPoolSize,250,50);
}
/**
* Create a crawler with specified properties.
* @param threadPoolSize the number of crawler threads that will be used.
* @param maximumURLQueueSize the maximum size of the URL queue. When this
* queue fills up, new URLs that are found are thrown away.
* @param maximumImageURLQueueSize the maximum size of the image queue. When
* this queue fills up and a crawler thread wants to add a new image to
* the queue, that thread is blocked. It will block until room becomes
* available in the image queue, as images are removed from the queue.
* @throws IllegalArgumentException if threadPoolSize is less than 1 or
* either of the queue sizes is less than 10.
*/
public ThreadedImageCrawler(int threadPoolSize,
int maximumURLQueueSize, int maximumImageURLQueueSize) {
if (threadPoolSize < 1)
throw new IllegalArgumentException(
"Number of threads must be greater that zero.");
if (maximumURLQueueSize < 10 || maximumImageURLQueueSize < 10)
throw new IllegalArgumentException(
"Maximum queue size must be at least 10");
threadPool = new CrawlThread[threadPoolSize];
foundURLs = Collections.synchronizedSet(new HashSet<URL>());
urlQueue = new ArrayBlockingQueue<URL>(maximumURLQueueSize);
imageURLQueue = new ArrayBlockingQueue<URL>(maximumImageURLQueueSize);
for (int i = 0; i < threadPoolSize; i++) {
threadPool[i] = new CrawlThread(i);
threadPool[i].start();
}
}
/**
* Add an address to the URL queue. This URL will be used as a
* starting point for crawling the web. If the URL queue is full,
* this method will block until room becomes available.
*/
public void addAddressToQueue(URL url) {
while (true) {
try {
urlQueue.put(url);
return;
}
catch (InterruptedException e) {
}
}
}
/**
* Get the next available image from the image queue. If the
* queue is empty, this method returns null. It does not block.
*/
public URL getNextImageURLIfAvailable() {
return imageURLQueue.poll();
}
/**
* Get the next available image from the image queue. If the
* queue is empty, this method will block until an image becomes
* available.
*/
public URL getNextImageURL() {
while (true) {
try {
return imageURLQueue.take();
}
catch (InterruptedException e) {
}
}
}
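/**
 * A minimal usage sketch, not part of the original class; the starting
 * address below is only a placeholder. It creates a crawler, seeds it
 * with a single web page, and prints the first ten image URLs found.
 */
public static void main(String[] args) throws Exception {
ThreadedImageCrawler crawler = new ThreadedImageCrawler(4);
crawler.addAddressToQueue(new URL("http://www.example.com/")); // placeholder start page
for (int i = 0; i < 10; i++) {
URL imageURL = crawler.getNextImageURL(); // blocks until an image URL is available
System.out.println("Found image: " + imageURL);
}
System.exit(0); // the crawling threads are daemons, but exit explicitly anyway
}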
//------------- the rest of the class is the private implementation ---------
/*
* Three patterns to use for searching web pages for links.
*/
private final static Pattern webLinkPattern = Pattern.compile(
"<a\\s+[^>]*href\\s*=\\s*(\"[^\"]+\"|'[^']+')",
Pattern.CASE_INSENSITIVE);
private final static Pattern frameLinkPattern = Pattern.compile(
"<frame\\s+[^>]*src\\s*=\\s*(\"[^\"]+\"|'[^']+')",
Pattern.CASE_INSENSITIVE);
private final static Pattern imageLinkPattern = Pattern.compile(
"<img\\s+[^>]*src\\s*=\\s*(\"[^\"]+\"|'[^']+')",
Pattern.CASE_INSENSITIVE);
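// Illustrative example (the input is made up, not from the original source): for
// the HTML fragment <img class="photo" src="pics/cat.jpg">, imageLinkPattern matches
// and group(1) is the quoted string "pics/cat.jpg", including its quote marks;
// getURLs() strips the quotes before resolving the address against the page URL.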
private Set<URL> foundURLs; // Used to avoid visiting duplicate URLs
private ArrayBlockingQueue<URL> urlQueue; // the crawler's URL queue
private ArrayBlockingQueue<URL> imageURLQueue; // the image queue
private CrawlThread[] threadPool; // the crawling threads
/**
* A subclass of Thread that defines the threads that crawl the web.
*/
private class CrawlThread extends Thread {
private int id; // An id, used only in log messages
CrawlThread(int id) {
setDaemon(true);
setPriority(getPriority() - 1);
this.id = id;
}
/**
* Send a message to a log. Does nothing, since the output command in
* this method has been commented out. Uncomment it to see log messages.
*/
private void log(String message) {
// System.out.println("Message from " + id + ": " + message);
}
/*
* The run method executes an infinite loop in which a URL is
* removed from the URL queue and processed. Processing consists
* of connecting to the URL and searching the HTML page for
* links to images and to other web pages, if possible. Image
* links are added to the image queue. Other links are "offered"
* to the URL queue.
*/
public void run() {
ArrayList<URL> linkURLs = new ArrayList<URL>();
ArrayList<URL> imageURLs = new ArrayList<URL>();
while (true) {
URL url;
try {
url = urlQueue.take(); // get URL from queue; can block.
}
catch (InterruptedException e) {
continue; // back to start of while loop; probably can't happen
}
InputStream pageContents = getConnection(url,imageURLs);
if (pageContents == null)
continue; // no HTML page; go back to start of while loop
getURLs(pageContents, url, linkURLs, imageURLs);
int imageCt = 0;
for (URL imageAddress : imageURLs) {
if (foundURLs.add(imageAddress)) {
try {
imageURLQueue.put(imageAddress);
log("Added " + imageAddress + " to the image URL queue.");
imageCt++;
if (imageCt > MAX_IMAGES_PER_PAGE)
break;
}
catch (InterruptedException e) {
}
}
log("Added " + imageAddress + " to the Image queue");
}
for (URL linkAddress : linkURLs) {
if (foundURLs.add(linkAddress)) {
if (urlQueue.offer(linkAddress) == false)
break;
}
}
linkURLs.clear();
imageURLs.clear();
try {
Thread.sleep(NICENESS_DELAY);
}
catch (InterruptedException e) {
}
}
}
/**
* Open a connection to a URL. If the connection succeeds and the
* content type of the resource is html, then an input stream is
* opened for reading the contents. If the content is an image,
* then the url is added to the list of imageURLs so that it can
* be added to the image queue.
*/
private InputStream getConnection(URL url, ArrayList<URL> imageURLs) {
InputStream pageContents;
URLConnection connection;
try {
connection = url.openConnection();
pageContents = connection.getInputStream();
}
catch (Exception e) {
log("Can't connect to " + url);
return null;
}
String contentType = connection.getContentType();
if (contentType != null && (contentType.startsWith("text/html") ||
contentType.startsWith("application/xhtml+xml"))) {
return pageContents;
}
if (contentType == null)
log("Can't figure out the data type of " + url);
else if (contentType.startsWith("image/"))
imageURLs.add(url);
else
log("Can't handle content type " + contentType
+ " at " + url);
try {
pageContents.close();
}
catch (IOException e) {
}
return null;
}
/**
* Reads from an input stream, which will be HTML text, and
* looks for image links and links to other web pages. The
* found URLs are added to the appropriate ArrayList.
*/
private void getURLs(InputStream source, URL parentURL,
ArrayList<URL> linkURLs, ArrayList<URL> imageURLs) {
Scanner in = new Scanner(source);
try {
while (in.hasNextLine()) {
String line = in.nextLine();
Matcher matcher;
matcher = webLinkPattern.matcher(line);
while ( matcher.find() ) { // found a URL in an "a" tag
String address = matcher.group(1);
address = address.substring(1,address.length()-1);
try {
linkURLs.add(new URL(parentURL,address));
}
catch (MalformedURLException e) { // Bad URL is ignored.
}
}
matcher = frameLinkPattern.matcher(line);
while ( matcher.find() ) { // found a URL in a "frame" tag
String address = matcher.group(1);
address = address.substring(1,address.length()-1);
try {
linkURLs.add(new URL(parentURL,address));
}
catch (MalformedURLException e) {
}
}
matcher = imageLinkPattern.matcher(line);
while ( matcher.find() ) { // found a URL in an "img" tag
String address = matcher.group(1);
address = address.substring(1,address.length()-1);
try {
imageURLs.add(new URL(parentURL,address));
}
catch (MalformedURLException e) {
}
}
}
}
catch (Exception e) {
}
finally {
try {
in.close(); // close the stream, ignoring any exception
}
catch (Exception e) {
}
}
}
} // end nested class CrawlThread
}