Commit 5081f0e6, authored 2 years ago by linja937

    comments added

Parent: 03d2d6f9

Showing 1 changed file: WebCrawler.java (50 additions, 64 deletions)
@@ -18,18 +18,18 @@ import java.util.regex.Pattern;
 public class WebCrawler {
 	/*public static void main(String[] args) {
 		// TODO Auto-generated method stub
 	}*/
 	public static void main(String[] args) {
 		WebCrawler crawler = new WebCrawler();
-		String rootURL = "https://www.scrapethissite.com";
-		int nrOfLevels = 8;
-		int nrOfLinks = 4;
+		String rootURL = "https://www.google.com";
+		int nrOfLevels = 3;
+		int nrOfLinks = 10;
 		System.out.println("Search started");
 		crawler.crawl(rootURL, nrOfLinks, nrOfLevels, 1); // 1 is start lvl
 		// crawler.print();
 		System.out.println("Done");
 	}
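Aside (not part of the commit): with these settings each crawl() call follows at most nrOfLinks = 10 links and recurses while level < nrOfLevels = 3, so a worst-case run touches on the order of 10 + 10² + 10³ = 1,110 URLs before the domain de-duplication further down trims the count.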
@@ -39,6 +39,7 @@ public class WebCrawler {
 	int nrOfTotLinks = 0;

 	public WebCrawler() {
@@ -47,7 +48,8 @@ public class WebCrawler {
 	}
-	public String spaceBuilder(int level) {
+	//This just builds the indentation for the tree structure
+	public String spaceBuilder(int level) {
 		String spaceString = "";
 		for(int i = 1; i < level; i++) {
 			spaceString = spaceString + "  ";
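Side note, not from the diff: the loop above appends one indent unit for every level below the current one. On Java 11 or newer the same thing can be written with String.repeat; a minimal sketch, assuming a two-space indent unit:

	public String spaceBuilder(int level) {
		// level 1 gets no indent, level 2 one unit, and so on, matching the original loop
		return "  ".repeat(Math.max(0, level - 1));
	}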
@@ -55,39 +57,32 @@ public class WebCrawler {
 		return spaceString;
 	}
-	public void print() {
+	//grabs the domain from the link (doesn't work due to a bug I haven't solved yet)
+	private String urlToDomain(String link) {
+		// TODO Auto-generated method stub
+		URL url = null;;
+		try {
+			url = new URL(link);
+		} catch (MalformedURLException e) {
+			// TODO Auto-generated catch block
+			return null;
+		}
+		String domain = url.getHost();
-		//while(!visitedURLs.isEmpty()) {
-			System.out.println(visitedURLs.size());
-			for(String url : visitedURLs) {
-				System.out.println(url);
-			}
-			/*if(visitedURLs.get(i).charAt(0)=='1') {
-				System.out.println("Level " + visitedURLs.get(i).charAt(0) + visitedURLs.get(i));
-			}else if(visitedURLs.get(i).charAt(0)=='1'){
-				System.out.println(" Level " + visitedURLs.get(i).charAt(0) + visitedURLs.get(i));
-			}else if(visitedURLs.get(i).charAt(0)=='1') {
-				System.out.println(" Level " + visitedURLs.get(i).charAt(0) + visitedURLs.get(i));
-			}*/
-		//}
-	}
+		return domain.startsWith("www.") ? domain.substring(4) : domain.trim();
+	}
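Aside, not part of the commit: the comment above flags a bug the author has not tracked down. One plausible culprit is that the regex hands this method links that new URL(...) cannot parse, so it returns null and the caller then de-duplicates on null. A hedged sketch of a more defensive variant, assuming java.net.URI is acceptable here (URI.getHost() returns null when there is no host, so callers still need to handle a null result):

	private String urlToDomain(String link) {
		try {
			String host = new java.net.URI(link).getHost();
			if (host == null) {
				return null; // relative or host-less link: nothing to extract
			}
			return host.startsWith("www.") ? host.substring(4) : host;
		} catch (java.net.URISyntaxException e) {
			return null; // unparseable link
		}
	}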
+	//checks if HTTP response has code 200 OK (it's simple and probably skips some links
+	// when it doesn't have too but it was the best I could do with minimal bugs)
 	public boolean checkURL(String link) {
 		boolean status = false;
 		try {
 			URL url = new URL(link);
 			HttpURLConnection myConnection = (HttpURLConnection) url.openConnection();
-			if(myConnection.getResponseCode() == 200) {
+			if((myConnection.getResponseCode() == 200)) {
 				status = true;
 			} else {
 				status = false;
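Another aside: as the comment admits, treating anything but a bare 200 as dead will drop working links, redirects (301/302) being the usual victims. A hedged sketch of a variant that accepts 2xx and 3xx and issues a HEAD request so no body is downloaded just to test a link; the timeout values are illustrative, and java.io.IOException is assumed to be imported:

	public boolean checkURL(String link) {
		try {
			HttpURLConnection conn = (HttpURLConnection) new URL(link).openConnection();
			conn.setRequestMethod("HEAD"); // headers are enough to judge the link
			conn.setConnectTimeout(5000);
			conn.setReadTimeout(5000);
			int code = conn.getResponseCode();
			return code >= 200 && code < 400; // 2xx OK, 3xx redirect still leads somewhere
		} catch (IOException e) {
			return false; // malformed URL or network failure: treat as dead
		}
	}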
@@ -108,12 +103,18 @@ public class WebCrawler {
 	@SuppressWarnings("unused")
+	//the method that does the actual crawling
 	public void crawl(String rootURL, int nrOfLinks, int nrOfLevels, int level) {
 		urlQueue.add(rootURL);
-		visitedURLs.add(rootURL);
+		// System.out.println(urlToDomain(rootURL));
+		visitedURLs.add(urlToDomain(rootURL)); // add to visited URLs
-		if(!urlQueue.isEmpty()){
+		if(!urlQueue.isEmpty()){ // only crawl if you have something to crawl
 			// remove the next url string from the queue to begin traverse.
 			String s = urlQueue.remove();
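Aside: the declarations of urlQueue and visitedURLs sit outside the hunks shown in this diff. A purely hypothetical sketch, inferred only from how the fields are used (add/remove on a FIFO queue, add/contains/iteration on the visited list; java.util imports assumed):

	Queue<String> urlQueue = new LinkedList<>();  // URLs waiting to be crawled
	List<String> visitedURLs = new ArrayList<>(); // URLs and their domains already seen

A HashSet would make the contains() checks in the loop below cheaper, but the removed print() method indexed visitedURLs like a list, so a List is the likelier choice here.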
@@ -140,28 +141,25 @@ public class WebCrawler {
 			}
-			// create a regex pattern matching a URL
+			// a regex pattern matching a URL
 			// that will validate the content of HTML in search of a URL.
 			String urlPattern = "(http://|https://)+/*[^\s]+[\\w]*/";
 			Pattern pattern = Pattern.compile(urlPattern);
 			Matcher matcher = pattern.matcher(rawHTML);
 			int i = 0;
-			while(i < nrOfLinks && matcher.find()){
+			while(i < nrOfLinks && matcher.find()){ // as long as there are URLs in the HTML, keep adding to queue and crawl on that URL
-				String actualURL = matcher.group();
-				// System.out.println("Attempt: " + i);
+				String actualURL = matcher.group(); // the URL to be examined
+				String domain = urlToDomain(actualURL); // get domain
+				if(!visitedURLs.contains(domain)){ // this should prevent the crawler to visit same doamin twice
+					if(checkURL(actualURL)) { //if URL works, print it, crawl it and increase the index so it knows it has found a link on this level
-				if(!visitedURLs.contains(actualURL)){
-					//check if link works
-					// System.out.println(checkURL(actualURL));
-					if(checkURL(actualURL)) {
 						nrOfTotLinks++;
-						// System.out.println(nrOfTotLinks);
-						// System.out.println(actualURL);
 						visitedURLs.add(actualURL);
+						visitedURLs.add(domain);
 						urlQueue.add(actualURL);
 						System.out.println(spaceBuilder(level) + "Level " + level + " " + actualURL);
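One more aside, not from the commit: it helps to see what this pattern actually extracts. The literal above renders here as [^\s]; in the source it is presumably [^\\s] (regex for non-whitespace), so the sketch below assumes that. The class name UrlPatternDemo exists only for this demo:

	import java.util.regex.Matcher;
	import java.util.regex.Pattern;

	public class UrlPatternDemo {
		public static void main(String[] args) {
			String rawHTML = "visit https://www.scrapethissite.com/pages/ and http://example.com/about/team.html today";
			Matcher matcher = Pattern.compile("(http://|https://)+/*[^\\s]+[\\w]*/").matcher(rawHTML);
			while (matcher.find()) {
				System.out.println(matcher.group());
			}
			// Prints:
			//   https://www.scrapethissite.com/pages/
			//   http://example.com/about/
			// The pattern must end in '/', so the second URL is cut back to its last slash,
			// and a bare link such as "https://example.org" is skipped entirely. That is one
			// reason the crawler misses some links, on top of the 200-only check above.
		}
	}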
@@ -171,7 +169,7 @@ public class WebCrawler {
 						i++;
 						if(level < nrOfLevels) {
-							crawl(actualURL, nrOfLinks, nrOfLevels, level + 1);
+							crawl(actualURL, nrOfLinks, nrOfLevels, level + 1); // crawl the link using recursion
 						}
 					}
@@ -181,21 +179,9 @@ public class WebCrawler {
 				}
 			}
 		}
 	}
-	// Each time the regex matches a URL in the HTML,
-	// add it to the queue for the next traverse and the list of visited URLs.
-	// breakpoint = getBreakpoint(numberOfRes, matcher, level);
-	// exit the outermost loop if it reaches the breakpoint.
-	// if(breakpoint == 0){
-		//if (level==1) {System.out.println("Crawl finished!");
-		//}
-		// break;
-	// }