Commit 5081f0e6 authored by linja937

comments added

parent 03d2d6f9
@@ -18,18 +18,18 @@ import java.util.regex.Pattern;
public class WebCrawler {
/*public static void main(String[] args) {
// TODO Auto-generated method stub
}*/
public static void main(String[] args) {
WebCrawler crawler = new WebCrawler();
String rootURL = "https://www.scrapethissite.com";
int nrOfLevels = 8;
int nrOfLinks = 4;
String rootURL = "https://www.google.com";
int nrOfLevels = 3;
int nrOfLinks = 10;
System.out.println("Search started");
crawler.crawl(rootURL,nrOfLinks, nrOfLevels, 1); // 1 is start lvl
// crawler.print();
System.out.println("Done");
}
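// Hypothetical variant (a sketch only for illustration; runFromArgs is not part of
// the class above): the same three settings could come from command-line arguments
// instead of being hardcoded, falling back to the values used in main().
public static void runFromArgs(String[] args) {
    String rootURL = args.length > 0 ? args[0] : "https://www.google.com";
    int nrOfLevels = args.length > 1 ? Integer.parseInt(args[1]) : 3;
    int nrOfLinks = args.length > 2 ? Integer.parseInt(args[2]) : 10;
    new WebCrawler().crawl(rootURL, nrOfLinks, nrOfLevels, 1); // 1 is the start level
}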
@@ -39,6 +39,7 @@ public class WebCrawler {
int nrOfTotLinks = 0;
public WebCrawler() {
@@ -47,7 +48,8 @@ public class WebCrawler {
}
public String spaceBuilder(int level) {
//This just builds the indentation for the tree structure
public String spaceBuilder(int level) {
String spaceString = "";
for (int i=1;i<level;i++) {
spaceString = spaceString + " ";
@@ -55,39 +57,32 @@ public class WebCrawler {
return spaceString;
}
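// A possible alternative (sketch only; spaceBuilderWithBuilder is a made-up name, not
// used above): build the same indentation with a StringBuilder, which avoids allocating
// a new String on every loop iteration.
public String spaceBuilderWithBuilder(int level) {
    StringBuilder sb = new StringBuilder();
    for (int i = 1; i < level; i++) {
        sb.append(" "); // one level of padding (same string as spaceBuilder appends)
    }
    return sb.toString();
}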
public void print() {
//grabs the domain from the link (doesn't work due to a bug I haven't solved yet)
private String urlToDomain(String link) {
// TODO Auto-generated method stub
URL url = null;
try {
url = new URL(link);
} catch (MalformedURLException e) {
// TODO Auto-generated catch block
return null;
}
String domain = url.getHost();
//while(!visitedURLs.isEmpty()) {
System.out.println(visitedURLs.size());
for (String url : visitedURLs) {
System.out.println(url);
}
/*if(visitedURLs.get(i).charAt(0)=='1') {
System.out.println("Level " + visitedURLs.get(i).charAt(0) + visitedURLs.get(i));
}else if(visitedURLs.get(i).charAt(0)=='1'){
System.out.println(" Level " + visitedURLs.get(i).charAt(0) + visitedURLs.get(i));
}else if(visitedURLs.get(i).charAt(0)=='1') {
System.out.println(" Level " + visitedURLs.get(i).charAt(0) + visitedURLs.get(i));
}*/
//}
}
return domain.startsWith("www.") ? domain.substring(4) : domain.trim();
}
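// A possible alternative (sketch only; urlToDomainViaUri is a made-up name, not part of
// this class): java.net.URI does the same host extraction without the checked
// MalformedURLException, and makes the "no host" case (relative or malformed links) explicit.
private String urlToDomainViaUri(String link) {
    try {
        String host = java.net.URI.create(link).getHost();
        if (host == null) {
            return null; // relative or otherwise host-less links
        }
        return host.startsWith("www.") ? host.substring(4) : host;
    } catch (IllegalArgumentException e) {
        return null; // URI.create throws this for syntactically invalid links
    }
}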
//checks if the HTTP response has code 200 OK (it's simple and probably skips some links
// when it doesn't have to, but it was the best I could do with minimal bugs)
public boolean checkURL(String link) {
boolean status = false;
try {
URL url = new URL(link);
HttpURLConnection myConnection = (HttpURLConnection) url.openConnection();
if(myConnection.getResponseCode() == 200) {
if((myConnection.getResponseCode() == 200)) {
status = true;
}else {
status = false;
@@ -108,12 +103,18 @@ public class WebCrawler {
@SuppressWarnings("unused")
//the method that does the actual crawling
public void crawl(String rootURL, int nrOfLinks, int nrOfLevels, int level) {
urlQueue.add(rootURL);
visitedURLs.add(rootURL);
// System.out.println(urlToDomain(rootURL));
visitedURLs.add(urlToDomain(rootURL)); // add to visited URLs
if(!urlQueue.isEmpty()){
if(!urlQueue.isEmpty()){ // only crawl if you have something to crawl
// remove the next URL string from the queue to begin the traversal.
String s = urlQueue.remove();
@@ -140,28 +141,25 @@ public class WebCrawler {
}
// create a regex pattern matching a URL
// a regex pattern matching a URL
// that will be used to scan the HTML content for URLs (see the standalone sketch at the end of this diff).
String urlPattern = "(http://|https://)+/*[^\\s]+[\\w]*/";
Pattern pattern = Pattern.compile(urlPattern);
Matcher matcher = pattern.matcher(rawHTML);
int i = 0;
while(i< nrOfLinks && matcher.find()){
while(i< nrOfLinks && matcher.find()){ // as long as the HTML still contains URLs and the per-level limit isn't reached, keep adding them to the queue and crawling them
String actualURL = matcher.group();
// System.out.println("Attempt: " + i);
String actualURL = matcher.group(); // the URL to be examined
String domain = urlToDomain(actualURL); // get domain
if(!visitedURLs.contains(domain)){ // this should prevent the crawler from visiting the same domain twice
if(checkURL(actualURL)) { // if the URL works, print it, crawl it, and increase the index so it knows it has found a link on this level
if(!visitedURLs.contains(actualURL)){
//check if link works
// System.out.println(checkURL(actualURL));
if(checkURL(actualURL)) {
nrOfTotLinks++;
// System.out.println(nrOfTotLinks);
// System.out.println(actualURL);
visitedURLs.add(actualURL);
visitedURLs.add(domain);
urlQueue.add(actualURL);
System.out.println(spaceBuilder(level) + "Level " + level +" "+ actualURL);
@@ -171,7 +169,7 @@ public class WebCrawler {
i++;
if(level<nrOfLevels) {
crawl(actualURL,nrOfLinks,nrOfLevels, level+1);
crawl(actualURL,nrOfLinks,nrOfLevels, level+1); // crawl the link using recursion
}
}
@@ -181,21 +179,9 @@ public class WebCrawler {
}
}
}
}
// Each time the regex matches a URL in the HTML,
// add it to the queue for the next traversal and to the list of visited URLs.
// breakpoint = getBreakpoint(numberOfRes, matcher, level);
// exit the outermost loop if it reaches the breakpoint.
// if(breakpoint == 0){
//if (level==1) {System.out.println("Crawl finished!");
//}
// break;
// }
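// Standalone sketch (separate from WebCrawler; UrlPatternDemo and the sample HTML are
// made up for illustration): shows what the URL pattern used in crawl() actually accepts.
// Because [^\s]+ is greedy and only whitespace stops it, a URL glued to its markup drags
// the trailing  ">link</  into the match, while the whitespace-separated URL comes out clean.
class UrlPatternDemo {
    public static void main(String[] args) {
        String urlPattern = "(http://|https://)+/*[^\\s]+[\\w]*/";
        String rawHTML = "<p><a href=\"https://www.example.com/pages\">link</a>"
                + " or http://example.org/other/ for more</p>";
        java.util.regex.Matcher matcher =
                java.util.regex.Pattern.compile(urlPattern).matcher(rawHTML);
        while (matcher.find()) {
            System.out.println(matcher.group()); // prints each substring the pattern accepts
        }
    }
}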