Web Grabber
BIBLIOGRAPHY
1. An Integrated Approach to Software Engineering – Pankaj Jalote
2. System Analysis and Design – Elias M. Awad
3. Elements of System Analysis – Marvin Gore & John W Stubbe
4. Java For Dummies
5. Java 2: The Complete Reference – Herbert Schildt
6. Thinking in Java - Bruce Eckel
7. Wikipedia – [Link]
Dept. of Computer Science, Don Bosco College
Web Grabber
Appendix
Screenshots
Web Grabber – Main window
Web Grabber – Enter URL
Dept. of Computer Science, Don Bosco College
Web Grabber
Web Grabber – Enter Level
Web Grabber – Start download
Dept. of Computer Science, Don Bosco College
Web Grabber
Web Grabber – Settings
Web Grabber – Start download
Dept. of Computer Science, Don Bosco College
Web Grabber
Web Grabber – Enter file URL
Web Grabber – Start download
Dept. of Computer Science, Don Bosco College
Web Grabber
Source Code (Java)
HTML parser class
package [Link];
import [Link];
import [Link];
import [Link];
import [Link];
import [Link];
import [Link];
import [Link];
import [Link];
import [Link];
import [Link];
import [Link];
import [Link];
/**
 * Holds one loaded page and offers helpers that list the link, media and
 * stylesheet addresses found in it.
 *
 * NOTE(review): every {@code [Link]} token in this listing is a placeholder
 * left behind by document extraction. The original identifiers have to be
 * restored from the real source file before the class can compile.
 */
public class ParseHtml {
// Loaded page; the declared type was replaced by an extraction placeholder.
[Link] doc ;
// The address text handed to the constructor.
String siteURL;
/**
 * Remembers the given address and tries to load the page into {@code doc}.
 * Failures are handled inside the constructor; nothing is rethrown.
 *
 * @param urls page address as text; also stored in {@code siteURL}
 */
public ParseHtml(String urls )
{
siteURL = urls ;
try {
URL url = new URL(urls);
// NOTE(review): the value printed after the format string is a placeholder.
print("Fetching %s...", [Link]());
// Second argument is 10*1000; presumably a timeout in milliseconds -- TODO confirm.
doc = [Link](url, 10*1000);
}catch (Exception e) {
// Every failure above ends up here, a malformed address included.
// NOTE(review): the three statements below lost their receivers during
// extraction: one receives the text "Connection error!!!", one decrements
// something, one is a no-argument call. Their targets cannot be read here.
[Link]("Connection error!!!");
[Link]--;
[Link]();
}
}
/**
 * Gives callers access to the page object held by this parser.
 *
 * @return the current value of the {@code doc} field
 */
public Document getDoc()
{
return this.doc;
}
/**
 * Opens a {@code PrintStream} on {@code path} and hands {@code content} to a
 * follow-up call; any exception is caught inside the method.
 *
 * NOTE(review): the closing brace of this method is missing from the listing
 * (the last brace below closes the catch block).
 *
 * @param path    file location given to the {@code PrintStream} constructor
 * @param content text passed to the first call after the stream is created
 */
public void saveHtml(String path , String content)
{
try{
PrintStream printStream = new PrintStream(path);
// NOTE(review): receivers lost in extraction; presumably these write the
// content to printStream and then close it -- TODO confirm.
[Link](content);
[Link]();
}catch (Exception e) {
// NOTE(review): placeholder call; what is done with the exception is not visible.
[Link]();
}
/**
 * Returns the result of a no-argument call as a {@code String}.
 *
 * NOTE(review): the receiver was lost in extraction; presumably this is the
 * markup of {@code doc} -- TODO confirm against the real source.
 *
 * @return the string produced by the placeholder call
 */
public String getHtmlCode(){
return [Link]();
}
Dept. of Computer Science, Don Bosco College
Web Grabber
/**
 * Walks the elements matched by the selector {@code a[href]} and collects
 * their {@code abs:href} attribute values, but only for elements that pass
 * an equality check against {@code siteURL}.
 *
 * @return the collected addresses, or {@code null} when any exception is
 *         raised while collecting
 */
public ArrayList<String> getLinks(){
try {
Elements links = [Link]("a[href]");
ArrayList<String> arr=new ArrayList<String>();
for (Element link : links) {
// The format string has a single %s but two values follow it, so the
// trimmed value is never part of the formatted message.
print("%s", [Link]("abs:href"), trim([Link](), 35));
// NOTE(review): the value compared with siteURL is hidden by a placeholder,
// so the exact filter rule cannot be read from this listing.
if([Link]().equals(siteURL)){
[Link]([Link]("abs:href"));
}
}
return arr;
} catch (Exception e) {
// TODO Auto-generated catch block
[Link]();
}
// Reached only after an exception was caught above.
return null;
}
/**
 * Walks the elements matched by the selector {@code [src]} and collects
 * their {@code abs:src} attribute values. Elements that pass the "img"
 * check get a longer log line with width, height and alt text; both
 * branches add the same attribute value.
 *
 * NOTE(review): several closing braces are missing from this listing (the
 * "img" branch, the loop and the method itself), so the block does not
 * parse as shown.
 *
 * @return the collected addresses, or {@code null} when any exception is
 *         raised while collecting
 */
public ArrayList<String> getImageJsLinks(){
try {
Elements media = [Link]("[src]");
ArrayList<String> arr=new ArrayList<String>();
print("\nMedia: (%d)", [Link]());
for (Element src : media) {
// NOTE(review): the value compared with "img" is hidden by a placeholder;
// presumably the element's tag name -- TODO confirm.
if ([Link]().equals("img")){
print(" * %s: <%s> %sx%s (%s)",
[Link](), [Link]("abs:src"), [Link]("width"), [Link]("height"),
trim([Link]("alt"), 20));
[Link]([Link]("abs:src"));
else{
print(" * %s: <%s>", [Link](), [Link]("abs:src"));
[Link]([Link]("abs:src"));
}
return arr;
} catch (Exception e) {
// TODO Auto-generated catch block
[Link]();
}
// Reached only after an exception was caught above.
return null;
/**
 * Walks the elements matched by the selector {@code link[href]} and collects
 * their {@code abs:href} attribute values.
 *
 * NOTE(review): the closing brace of this method is missing from the listing.
 *
 * @return the collected addresses, or {@code null} when any exception is
 *         raised while collecting
 */
public ArrayList<String> getCss(){
try {
// NOTE(review): the next two lines are the report's running page header and
// footer (the same text recurs throughout the document); they are not code.
Dept. of Computer Science, Don Bosco College
Web Grabber
Elements imports = [Link]("link[href]");
ArrayList<String> import_array=new ArrayList<String>();
for (Element link : imports) {
print(" * %s <%s> (%s)", [Link](),[Link]("abs:href"),
[Link]("rel"));
import_array.add([Link]("abs:href"));
}
// NOTE(review): placeholder call that receives the finished list; its
// target is not visible here.
[Link](import_array);
return import_array;
} catch (Exception e) {
// TODO Auto-generated catch block
[Link]();
}
// Reached only after an exception was caught above.
return null;
/**
 * Logging helper called by the other methods with a format string and its
 * values.
 *
 * NOTE(review): the body is empty in this listing, so as shown the method
 * does nothing. The original statement may have been dropped during
 * extraction -- confirm against the real source.
 *
 * @param msg  format string supplied by the caller
 * @param args values supplied for the format string
 */
private void print(String msg, Object... args) {
}
/**
 * Shortens text for log output: when a measured value exceeds {@code width}
 * the result is a piece taken with the bounds (0, width-1) followed by ".",
 * otherwise {@code s} is returned unchanged.
 *
 * NOTE(review): both receivers are extraction placeholders; presumably they
 * are the length of {@code s} and a substring of {@code s} -- TODO confirm.
 *
 * @param s     text to shorten
 * @param width threshold used in the comparison and in the cut position
 * @return the shortened text, or {@code s} itself
 */
private String trim(String s, int width) {
if ([Link]() > width)
return [Link](0, width-1) + ".";
else
return s;
}
Html Grabber class
package [Link];
import [Link];
import [Link];
import [Link];
import [Link];
import [Link];
import [Link];
import [Link];
import [Link];
import [Link];
/**
 * Downloads pages together with their stylesheets and, optionally, their
 * media files, using {@code ParseHtml} to find the addresses.
 *
 * NOTE(review): every {@code [Link]} token in this listing is a placeholder
 * left behind by document extraction, and the closing brace of the class is
 * not present in the listing.
 */
public class Grabber {
// Not referenced by any token that survived extraction in the methods below.
DownloadThread downloadThread;
/**
 * Saves one page: builds a file name, loads the page with {@code ParseHtml},
 * rewrites its stylesheet (and optionally media) references through
 * {@code saveCss} / {@code saveImages}, then hands the page path and the
 * rewritten content to a final call.
 *
 * NOTE(review): the block does not parse as listed. It contains an
 * {@code if()} with no condition, and the braces closing the
 * {@code if(getImages)} block and the method are missing.
 *
 * @param url       address of the page, as text
 * @param getImages when true, the addresses from {@code getImageJsLinks} are
 *                  also passed to {@code saveImages}
 * @param savPath   base folder text; a folder name derived from the page is
 *                  appended to it
 */
public static void grabTheSite(String url,boolean getImages,String savPath){
URL urls;
try {
urls = new URL(url); //sets the main url
[Link]("Grabbing :"+url);
// NOTE(review): receivers lost; presumably the file name is the part of the
// address from its last '/' onwards -- TODO confirm.
String filename=[Link]([Link]('/'));
if([Link](".")){
if([Link]("?")){
filename=[Link](0,[Link]('?'));
}
// NOTE(review): the condition of this if is missing from the listing.
if(){
filename=filename+".html";
}
// NOTE(review): the next two lines are the report's running page header and
// footer; they are not code.
Dept. of Computer Science, Don Bosco College
Web Grabber
}
[Link](urls);
String savePath = savPath; //sets the folder path
String content;
ParseHtml parseHtml = new ParseHtml(url);
URL urlf=new URL([Link]().baseUri());
String folder = [Link](); //gets the foldername(domain name)
savePath = savePath + folder;
File baseFolder = new File(savePath);
// NOTE(review): placeholder call; presumably creates baseFolder -- TODO confirm.
[Link]();
//[Link]("first page only...");
String path = savePath + "/"+filename; //path of firstpage
ArrayList<String> cssLinks=new ArrayList<String>([Link]());
[Link](cssLinks);
content=[Link]();
// saveCss returns the content it was given after its own replacements.
content=saveCss(cssLinks, content, savePath);
if(getImages){
ArrayList<String>imageLinks=
new ArrayList<String>([Link]());
[Link](imageLinks);
content=saveImages(imageLinks, content, savePath);
// NOTE(review): placeholder call taking (path, content); presumably the
// page is written to disk here -- TODO confirm.
[Link](path, content);
} catch (MalformedURLException e) {
// TODO Auto-generated catch block
[Link]();
}
/**
 * For each distinct address in {@code imgUrls}, calls a download step that
 * targets {@code savePath + "/images/" + name}, and performs a replacement on
 * {@code content} using the address and the text {@code "./images/" + name}.
 *
 * NOTE(review): the braces closing the catch block and the for loop are
 * missing from the listing, so it cannot be told from here whether the
 * replacement runs for every address or only after a failed download.
 *
 * @param imgUrls  addresses to fetch; duplicates are removed via a HashSet
 * @param content  page text in which the addresses are replaced
 * @param savePath folder text used as the prefix of the download target
 * @return the content after the replacements
 */
public static String saveImages(ArrayList<String> imgUrls , String content ,
String savePath)
{
HashSet<String> imgs = new HashSet<String>(imgUrls);
for(String s : imgs)
{
String temp = new String(s);
// NOTE(review): receivers lost; presumably temp becomes the tail of s
// starting at a "/" position -- TODO confirm which position.
temp = [Link]([Link]("/"), [Link]());
try {
// NOTE(review): placeholder call taking (URL, File); presumably it copies
// the remote resource into the file -- TODO confirm.
[Link](new URL(s), new File(savePath+"/images/"+temp));
} catch (Exception e) {
// TODO Auto-generated catch block
[Link]();
String t = "./images/"+temp ;
content = [Link](s, t);
[Link](s+ " " +temp);
// NOTE(review): the next two lines are the report's running page header and
// footer; they are not code.
Dept. of Computer Science, Don Bosco College
Web Grabber
[Link]("image:"+ temp);
return content;
}
/**
 * For each distinct address in {@code cssUrls}, calls a download step that
 * targets {@code savePath + name}, and performs a replacement on
 * {@code content} using the address and the text {@code "./" + name}.
 *
 * NOTE(review): the braces closing the catch block and the for loop are
 * missing from the listing, so it cannot be told from here whether the
 * replacement runs for every address or only after a failed download.
 *
 * @param cssUrls  addresses to fetch; duplicates are removed via a HashSet
 * @param content  page text in which the addresses are replaced
 * @param savePath folder text used as the prefix of the download target
 * @return the content after the replacements
 */
public static String saveCss(ArrayList<String> cssUrls , String content ,
String savePath)
{
HashSet<String> css = new HashSet<String>(cssUrls);
for(String s : css)
{
String temp = new String(s);
// NOTE(review): receivers lost; presumably temp becomes the tail of s
// starting at a "/" position -- TODO confirm which position.
temp = [Link]([Link]("/"), [Link]());
try {
// Unlike saveImages, no separator is inserted between savePath and temp.
[Link](new URL(s), new File(savePath+temp));
} catch (Exception e) {
// TODO Auto-generated catch block
[Link]();
String t = "./"+temp ;
content = [Link](s, t);
[Link](s+ " " +temp);
[Link]("css:" + s);
return content;
}
/**
 * Entry point for a download run. Behaviour depends on {@code level}:
 * 0 grabs only {@code url}; 1 gathers the links of each linked page and then
 * grabs every entry of {@code pageLinks}; 2 performs two gathering passes
 * before grabbing. Other values do nothing beyond loading the start page.
 *
 * NOTE(review): the flag tested with {@code == false} / {@code == true} and
 * the logging calls are extraction placeholders, so what they read or write
 * is not visible here. Presumably the flag is a pause switch -- TODO confirm.
 *
 * @param level     crawl depth selector (0, 1 or 2 are handled)
 * @param url       address of the start page, as text
 * @param path      base folder text forwarded to {@code grabTheSite}
 * @param getImages forwarded to {@code grabTheSite}
 * @param thread    not referenced by any token that survived extraction
 */
public void test(int level,String url,String path,boolean getImages ,
DownloadThread thread){
ParseHtml parseHtml=new ParseHtml(url);
HashSet<String> pageLinks=new HashSet<String>([Link]());
if(level==0){
grabTheSite(url, getImages, path);
}
if(level==1){
if([Link] == false){
[Link]("Entering level 1 strategy...");
[Link]("Entering level 1 strategy...");
HashSet<String> cssLinks=new HashSet<String>([Link]());
ArrayList<String> subLinks=new ArrayList<String>();
[Link]("Initialization success....");
[Link]("Initialization success....");
// Gathering pass: load every linked page and collect what it links to.
for(String link:pageLinks){
try{
[Link]("getting sublinks of::"+link);
[Link]("getting sublinks of::"+link+" ");
ParseHtml subParseHtml=new ParseHtml(link);
[Link]([Link]());
// NOTE(review): the next two lines are the report's running page header and
// footer; they are not code.
Dept. of Computer Science, Don Bosco College
Web Grabber
}catch(Exception e){
[Link]("Dead link...!");
[Link]("Dead link...!");
[Link]();
}
}
// NOTE(review): placeholder call receiving subLinks; presumably it merges
// them into pageLinks -- TODO confirm.
[Link](subLinks);
}else
{
// Flag was not false: poll once per second until it stops being true.
// As written, the gathering pass above is skipped in this case.
while([Link] == true){
try {
[Link]().sleep(1000);
} catch (InterruptedException e) {
// TODO Auto-generated catch block
[Link]();
}
[Link]("Thread sleeping");
}
}
// Grabbing pass over every collected link.
for(String link:pageLinks){
try{
if([Link] == false){
grabTheSite(link, getImages, path); }
else
{
// As written, a link reached while the flag is true is waited on here and
// is not grabbed afterwards; the loop moves on to the next link.
while([Link] == true){
[Link]().sleep(1000);
[Link]("Thread sleeping");
}
}
}catch (Exception e) {
[Link]("Error grabbing page :" +link);
[Link]("Error grabbing page :" +link);
}
}
}
if(level==2){
[Link]("Entering level 2 strategy...");
// NOTE(review): this message says "level 1" although the branch is level 2.
[Link]("Entering level 1 strategy...");
HashSet<String> cssLinks=new HashSet<String>([Link]());
ArrayList<String> subLinks=new ArrayList<String>();
[Link]("Initialization success....");
[Link]("Initialization success....");
// First gathering pass.
for(String link:pageLinks){
try{
[Link]("getting sublinks of::"+link);
[Link]("getting sublinks of::"+link+" ");
ParseHtml subParseHtml=new ParseHtml(link);
[Link]([Link]());
[Link]();
}catch(Exception e){
[Link]("Dead link...!");
[Link]("Dead link...!");
//[Link]();
}
}
[Link](subLinks);
// Second gathering pass over the (presumably enlarged) link set.
for(String link:pageLinks){
try{
ParseHtml lev2parse=new ParseHtml(link);
[Link]([Link]());
}catch (Exception e) {
[Link]("dead link...");
[Link]("Dead link...! ");
}
// NOTE(review): the next two lines are the report's running page header and
// footer; they are not code.
Dept. of Computer Science, Don Bosco College
Web Grabber
}
[Link](subLinks);
// Grabbing pass; this branch has no flag check, unlike level 1.
for(String link:pageLinks){
try{
grabTheSite(link, getImages, path);
}catch (Exception e) {
[Link]("Error grabbing page :" +link);
[Link]("Error grabbing page :" +link+" ");
}
}
}
}
/**
 * Downloads a single file: derives a file name from the address, prefixes it
 * with "WG" and calls a download step that targets {@code path + filename}.
 * {@code MalformedURLException} and {@code IOException} are caught inside
 * the method.
 *
 * NOTE(review): the receivers of the string operations are extraction
 * placeholders, so the exact cut positions (first or last occurrence)
 * cannot be read from this listing.
 *
 * @param url  address of the file, as text
 * @param path folder text placed directly in front of the file name
 */
public static void saveFile(String url,String path){
// extension is not mentioned by any later line of this method that survived
// extraction. Its start index is one before a "." position.
String extension=[Link]([Link](".")-1,[Link]());
String filename=[Link]([Link]('/')+1,[Link]());
// When a "?" or "%" is found, the name is replaced by the text after that
// character, not the text before it.
if([Link]("?")){
filename=[Link]([Link]("?")+1,[Link]());
}
if([Link]("%")){
filename=[Link]([Link]("%")+1,[Link]());
}
filename="WG"+filename;
try {
[Link]("Downloading :" +filename);
[Link]("Download started!"+ filename);
// NOTE(review): placeholder call taking (URL, File); presumably it copies
// the remote resource into the file -- TODO confirm.
[Link](new URL(url), new File(path+filename));
[Link]("Finished downloading :" + filename);
} catch (MalformedURLException e) {
// TODO Auto-generated catch block
[Link]();
} catch (IOException e) {
// TODO Auto-generated catch block
[Link]();
}
}
Dept. of Computer Science, Don Bosco College