diff --git a/WebCrawler.java b/WebCrawler.java
index 114271e26db4cd8d9445b59148166f0718415dea..839b3fb824c86419d4fdd8d376fd63575fea8704 100644
--- a/WebCrawler.java
+++ b/WebCrawler.java
@@ -18,18 +18,18 @@ import java.util.regex.Pattern;
 public class WebCrawler {
 
-    /*public static void main(String[] args) {
-        // TODO Auto-generated method stub
-    }*/
     public static void main(String[] args) {
         WebCrawler crawler = new WebCrawler();
-        String rootURL = "https://www.scrapethissite.com";
-        int nrOfLevels = 8;
-        int nrOfLinks = 4;
+        String rootURL = "https://www.google.com";
+        int nrOfLevels = 3;
+        int nrOfLinks = 10;
+        System.out.println("Search started");
         crawler.crawl(rootURL,nrOfLinks, nrOfLevels, 1); // 1 is start lvl
-        // crawler.print();
+
+        System.out.println("Done");
+
     }
 
@@ -39,6 +39,7 @@ public class WebCrawler {
 
     int nrOfTotLinks = 0;
 
+
     public WebCrawler() {
 
@@ -47,7 +48,8 @@ public class WebCrawler {
     }
 
-    public String spaceBuilder(int level) {
+    //This just builds the indentation for the tree structure
+    public String spaceBuilder(int level) {
         String spaceString = "";
         for (int i=1;i<level;i++) {
             spaceString = spaceString + " ";
@@ -55,39 +57,32 @@ public class WebCrawler {
         return spaceString;
     }
-
-    public void print() {
+    //grabs the domain from the link (doesn't work due to a bug I haven't solved yet)
+    private String urlToDomain(String link) {
+        // TODO Auto-generated method stub
+        URL url = null;;
+        try {
+            url = new URL(link);
+        } catch (MalformedURLException e) {
+            // TODO Auto-generated catch block
+            return null;
+        }
+        String domain = url.getHost();
-        //while(!visitedURLs.isEmpty()) {
-        System.out.println(visitedURLs.size());
-        for (String url : visitedURLs) {
-
-            System.out.println(url);
-        }
-        /*if(visitedURLs.get(i).charAt(0)=='1') {
-            System.out.println("Level " + visitedURLs.get(i).charAt(0) + visitedURLs.get(i));
-        }else if(visitedURLs.get(i).charAt(0)=='1'){
-            System.out.println(" Level " + visitedURLs.get(i).charAt(0) + visitedURLs.get(i));
-        }else if(visitedURLs.get(i).charAt(0)=='1') {
-            System.out.println(" Level " + visitedURLs.get(i).charAt(0) + visitedURLs.get(i));
-        }*/
-
-
-
-
-
-        //}
-
-    }
+        return domain.startsWith("www.") ? domain.substring(4) : domain.trim();
+
+    }
+    //checks if HTTP response has code 200 OK (it's simple and probably skips some links
+    // when it doesn't have to but it was the best I could do with minimal bugs)
     public boolean checkURL(String link) {
         boolean status = false;
         try {
             URL url = new URL(link);
+
             HttpURLConnection myConnection = (HttpURLConnection) url.openConnection();
-            if(myConnection.getResponseCode() == 200) {
+            if((myConnection.getResponseCode() == 200)) {
                 status = true;
             }else {
                 status = false;
@@ -108,12 +103,18 @@ public class WebCrawler {
 
-    @SuppressWarnings("unused")
+
+
+
+
+    //the method that does the actual crawling
     public void crawl(String rootURL, int nrOfLinks, int nrOfLevels, int level) {
         urlQueue.add(rootURL);
-        visitedURLs.add(rootURL);
+
+        // System.out.println(urlToDomain(rootURL));
+        visitedURLs.add(urlToDomain(rootURL)); // add to visited URLs
-        if(!urlQueue.isEmpty()){
+        if(!urlQueue.isEmpty()){ // only crawl if you have something to crawl
            // remove the next url string from the queue to begin traverse.
            String s = urlQueue.remove();
@@ -140,28 +141,25 @@ public class WebCrawler {
             }
-            // create a regex pattern matching a URL
+            // a regex pattern matching a URL
             // that will validate the content of HTML in search of a URL.
             String urlPattern = "(http://|https://)+/*[^\\s]+[\\w]*/";
             Pattern pattern = Pattern.compile(urlPattern);
             Matcher matcher = pattern.matcher(rawHTML);
             int i = 0;
-            while(i< nrOfLinks && matcher.find()){
+            while(i< nrOfLinks && matcher.find()){ // as long as there are URLs in the HTML, keep adding to queue and crawl on that URL
-                String actualURL = matcher.group();
-                // System.out.println("Attempt: " + i);
+                String actualURL = matcher.group(); // the URL to be examined
+
+                String domain = urlToDomain(actualURL); // get domain
+                if(!visitedURLs.contains(domain)){ // this should prevent the crawler from visiting the same domain twice
+
+
+                    if(checkURL(actualURL)) { //if URL works, print it, crawl it and increase the index so it knows it has found a link on this level
-                if(!visitedURLs.contains(actualURL)){
-                    //check if link works
-                    // System.out.println(checkURL(actualURL));
-                    if(checkURL(actualURL)) {
-                        nrOfTotLinks++;
-                        // System.out.println(nrOfTotLinks);
-                        // System.out.println(actualURL);
-                        visitedURLs.add(actualURL);
+                        visitedURLs.add(domain);
                         urlQueue.add(actualURL);
-                        System.out.println(spaceBuilder(level) + "Level " + level +" "+ actualURL);
@@ -171,7 +169,7 @@ public class WebCrawler {
                     i++;
                     if(level<nrOfLevels) {
-                        crawl(actualURL,nrOfLinks,nrOfLevels, level+1);
+                        crawl(actualURL,nrOfLinks,nrOfLevels, level+1); // crawl the link using recursion
                     }
                 }
@@ -181,21 +179,9 @@ public class WebCrawler {
             }
         }
     }
+
+
 }
-
-
-    // Each time the regex matches a URL in the HTML,
-    // add it to the queue for the next traverse and the list of visited URLs.
-    // breakpoint = getBreakpoint(numberOfRes, matcher, level);
-    // exit the outermost loop if it reaches the breakpoint.
-    // if(breakpoint == 0){
-
-    //if (level==1) {System.out.println("Crawl finished!");
-
-    //}
-    //    break;
-    // }
-
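A note on the urlToDomain() helper the patch introduces, whose own comment flags it as still buggy. Without knowing the exact failure, a common hardening is to parse with java.net.URI, lower-case the host, and return null explicitly whenever no host can be extracted so the caller can skip the link. The following is a minimal standalone sketch of that idea, not part of the patch; the class and method names are placeholders.

import java.net.URI;
import java.net.URISyntaxException;

public class DomainSketch {

    // Hypothetical, more defensive variant of urlToDomain(): returns a
    // lower-cased host with any leading "www." stripped, or null when the
    // link has no parseable host (the caller should then skip the link).
    static String urlToDomain(String link) {
        try {
            String host = new URI(link.trim()).getHost(); // null for opaque or host-less URIs
            if (host == null) {
                return null;
            }
            host = host.toLowerCase();
            return host.startsWith("www.") ? host.substring(4) : host;
        } catch (URISyntaxException e) {
            return null; // link could not be parsed at all
        }
    }

    public static void main(String[] args) {
        System.out.println(urlToDomain("https://www.Google.com/search?q=java")); // google.com
        System.out.println(urlToDomain("not a url"));                            // null
    }
}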
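The checkURL() method kept by the patch opens a full GET connection just to read the status code. If the goal is only to test whether a link answers, a HEAD request with connect and read timeouts avoids downloading response bodies and keeps one dead host from stalling the crawl. A sketch of that alternative is below; it is not what the patch does, and some servers reject HEAD, so a GET fallback may still be needed.

import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.URL;

public class LinkCheckSketch {

    // Variant of checkURL(): HEAD request plus timeouts, so no body is
    // downloaded and a slow or dead server cannot block the crawl forever.
    static boolean checkURL(String link) {
        try {
            HttpURLConnection conn = (HttpURLConnection) new URL(link).openConnection();
            conn.setRequestMethod("HEAD");   // only headers are requested
            conn.setConnectTimeout(3000);    // ms allowed to establish the connection
            conn.setReadTimeout(3000);       // ms allowed to wait for the status line
            int code = conn.getResponseCode();
            conn.disconnect();
            return code == HttpURLConnection.HTTP_OK; // same 200-only rule as the patch
        } catch (IOException e) {
            return false; // malformed URL, timeout, or connection failure
        }
    }

    public static void main(String[] args) {
        System.out.println(checkURL("https://www.google.com"));
    }
}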
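Finally, the retained pattern "(http://|https://)+/*[^\\s]+[\\w]*/" only matches candidates that end in a literal slash, so links such as https://example.com/page.html are truncated at their last slash rather than captured whole. A small standalone check of that behaviour, using illustrative input only:

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class UrlPatternDemo {
    public static void main(String[] args) {
        // The same pattern crawl() keeps using.
        Pattern pattern = Pattern.compile("(http://|https://)+/*[^\\s]+[\\w]*/");

        // Illustrative input: one link ends with "/", the other with ".html".
        String rawHTML = "visit https://example.com/docs/ or https://example.com/page.html today";

        Matcher matcher = pattern.matcher(rawHTML);
        while (matcher.find()) {
            System.out.println(matcher.group());
        }
        // Prints:
        //   https://example.com/docs/
        //   https://example.com/      <- page.html is cut off at the last "/"
    }
}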