#!/usr/bin/perl -w
#
# Fetch the raw repository list published by ROAR (roar.eprints.org) and add
# every repository whose eprintid is not yet in the repositories table.
#
# Input format (as parsed below): one "field: value" pair per line, with
# records separated by an empty line.
#
# Side effects:
#   * appends to repolog.txt in the current directory
#   * inserts rows into the repositories table (or its _dev twin)
#   * prints progress/debug output to STDOUT

use strict;
use warnings;

use LWP::Simple;
use DBI;

# db_connect.inc is expected to set these package globals -- TODO confirm
# against that file; nothing else in this script assigns them.
our ($userid, $passwd, $host);
require("db_connect.inc");

# In development mode (use dev db, more output), 0 for normal/live
my $dev = "1";

if ($dev) {
    # NOTE(review): this echoes the database password to STDOUT; consider
    # removing it before running anywhere the output is captured.
    print "user $userid and password $passwd on host $host \n";
}

# details for open database for recording
my $db = "ircount";
#$db="ircount_dev";
my $connectionInfo = "dbi:mysql:$db;$host";

my $tbl_repositories = $dev ? "repositories_dev" : "repositories";

# get current time; localtime returns a 0-based month and years since 1900
my ($day, $month, $year) = (localtime)[3, 4, 5];
$year  += 1900;
$month += 1;

# single digit days and months should start with a 0
$month = sprintf "%02d", $month;
$day   = sprintf "%02d", $day;

# open file for recording data (once, and only after the date is known so the
# "start" line carries a real timestamp)
open(my $log_fh, '>>', 'repolog.txt') or die "can not open repolog \n";
print {$log_fh} "METAMETA\t$year $month $day\tstart\n";

# make connection to database
my $dbh = DBI->connect($connectionInfo, $userid, $passwd)
    or die "can not connect to database: $DBI::errstr\n";

# make sure we use utf8, especially for the archive names
# (and make sure tbl and field are utf8 general as well!)
$dbh->{'mysql_enable_utf8'} = 1;
$dbh->do('SET NAMES utf8');

# get a list of eprintid numbers we already have in the table.
# A failure here must be fatal: carrying on with an empty list would make
# every repository look new.
my $select_sql = "SELECT eprintid from $tbl_repositories";
my $select_sth = $dbh->prepare($select_sql)
    or die "can not prepare '$select_sql': " . $dbh->errstr . "\n";
$select_sth->execute()
    or die "can not execute '$select_sql': " . $select_sth->errstr . "\n";

# build hash of eprintids...
my %eprintids = ();
while (my @row = $select_sth->fetchrow_array()) {
    $eprintids{ $row[0] } = $row[0];
}

# The values come from a remote text file, so they are bound through
# placeholders instead of being pasted into the SQL string.
my $insert_sql = "INSERT INTO $tbl_repositories"
    . " (eprintid, title, type, home_page, location_country)"
    . " VALUES (?, ?, ?, ?, ?)";
my $insert_sth = $dbh->prepare($insert_sql)
    or die "can not prepare '$insert_sql': " . $dbh->errstr . "\n";

# get the data from eprints.org
my $URL = "http://roar.eprints.org/rawlist.txt";

# now do the actual getting, easy thanks to LWP::Simple
my $content = get $URL;
die "Couldn't get it!" unless defined $content;

# split content into separate lines
my @lines = split /\n/, $content;

# the fields we keep from each record; everything else (e.g. "oai") is ignored
my %wanted_field = map { $_ => 1 }
    qw(eprintid title type home_page location_country);

# Store one completed record: if its eprintid is not already known, write it
# to the log file and insert it into the repositories table.
# Takes a hash ref keyed by the field names in %wanted_field.
my $save_record = sub {
    my ($record) = @_;

    my $eprintid = $record->{eprintid};

    # nothing collected (leading or repeated blank lines) -- nothing to store
    return if !defined $eprintid;

    # this eprintid exists in the db, for now do nothing...
    return if exists $eprintids{$eprintid};

    # it must be new; missing fields are stored as empty strings
    my ($name, $type, $home_page, $location_country) =
        map { defined $record->{$_} ? $record->{$_} : "" }
        qw(title type home_page location_country);

    # first write it to the log file...
    print {$log_fh} "$eprintid\t$name\t$type\t\n";

    # ...then write record to db. A failed insert is reported by DBI's
    # default PrintError warning and we carry on with the next record.
    my @values = ($eprintid, $name, $type, $home_page, $location_country);
    if ($insert_sth->execute(@values)) {
        # remember it so a duplicate later in the same list is not re-inserted
        $eprintids{$eprintid} = $eprintid;
    }
    print "query $insert_sql [" . join(", ", @values) . "] \n\n";

    return;
};

########################################
# main loop
# process the rawlist file from ROAR; once we have an entire record (we know
# we do because there is a blank line) store it if the repository is new.
my %record;
for my $line (@lines) {
    # tolerate CRLF line endings
    $line =~ s/\r\z//;

    # blank line means we have reached the end of this record
    if ($line eq "") {
        $save_record->(\%record);

        # clear values ready for next
        %record = ();
        next;
    }

    # this splits and removes the space after the colon
    my ($field, $data) = split /: /, $line, 2;

    # skip lines without a value (but keep a literal "0")
    next if !defined $data || $data eq "";

    print "field $field data $data \n";

    $record{$field} = $data if $wanted_field{$field};
}

# split drops trailing empty lines, so the final record never sees its
# terminating blank line inside the loop; store it here.
$save_record->(\%record);

close($log_fh) or warn "can not close repolog: $!\n";
$dbh->disconnect;

exit;