# A Perl script that recursively searches through a directory tree and extracts image URLs from the files it finds (typically HTML). It looks for <img src="..."> tags and writes the image URLs to an output file, one URL per line. Useful for scraping image links from websites or HTML archives.
# Usage: perl extr_img.pl <directory> <output_file>
# Every plain file under <directory> is scanned (there is no filter on file extension); image URLs are extracted from tags matching the pattern <img src="..." border="0" />
#!/usr/bin/perl
#
# tree($root)
#
# Recursively collects the paths of all plain files below the directory
# $root and returns them as a flat list. Each returned path is $root joined
# with the entry name, so it is relative or absolute exactly as $root is.
#
# Side effects:
#   * Prints one "searching in ..." progress line to STDOUT per directory
#     visited, after that directory's contents have been processed.
#   * Increments the package variable $i once per file found; $i is a
#     running total across all calls and is never reset here.
#
# A directory that cannot be opened produces a warning and contributes no
# files; it does not abort the walk.
sub tree {
    my ($root) = @_;

    # Join path components with the separator of the current platform.
    # A hard-coded backslash only works on Windows; elsewhere the -f/-d
    # tests below would never succeed and nothing would be found.
    require File::Spec;

    # Running file count used in the progress message. Start it at 0 so
    # the message shows a number even before the first file is found.
    our $i;
    $i = 0 unless defined $i;

    my @filenames;

    # A lexical dirhandle keeps each recursion level's handle private.
    if (opendir my $dir_handle, $root) {
        my @entries = readdir $dir_handle;
        closedir $dir_handle;

        for my $entry (@entries) {
            # Skip the self and parent links to avoid endless recursion.
            next if $entry eq '.' or $entry eq '..';

            my $path = File::Spec->catfile($root, $entry);
            if (-f $path) {
                push @filenames, $path;
                $i++;
            }
            elsif (-d $path) {
                push @filenames, tree($path);
            }
        }
    }
    else {
        warn "cannot open directory $root: $!\n";
    }

    print "searching in $root, $i files so far...\n";
    return @filenames;
}
# Main program.
#
# Usage: perl extr_img.pl <directory> <output_file>
#
# Collects every file below <directory> with tree(), scans each file line
# by line for image tags, and writes each captured src URL to <output_file>,
# one per line. Progress messages go to STDOUT. The script dies if the
# output file or any input file cannot be opened.
die "usage: perl $0 <directory> <output_file>\n" unless @ARGV == 2;
my ($search_dir, $out_name) = @ARGV;

my @files = tree($search_dir);

print "opening $out_name...\n";
open(my $out, '>', $out_name) or die "$out_name fail: $!";

# NOTE(review): the original pattern was lost from this file (the regex was
# empty apart from a line break). This one is reconstructed from the
# script's description, <img src="..." border="0" /> -- confirm it matches
# the markup you need to scan. Whitespace between attributes is flexible
# and matching is case-insensitive.
my $img_re = qr{<img\s+src="([^"]*)"\s+border="0"\s*/>}i;

foreach my $file (@files) {
    print "opening $file...\n";
    open(my $in, '<', $file) or die "$file fail: $!";

    # Read one line at a time; the inner loop uses /g so that several
    # image tags on the same line are all reported.
    while (my $line = <$in>) {
        while ($line =~ /$img_re/g) {
            print {$out} "$1\n";
        }
    }
    close($in);
}

print "closing $out_name...\n";
# Buffered write errors only surface at close, so check it.
close($out) or die "$out_name close fail: $!";