User:TheDJ/grin importer.pl
Jump to navigation
Jump to search
#!/usr/bin/perl
#
# GRIN importer.
#
# Reads a list of GRIN image IDs from gpn.txt (one per line), downloads the
# abstract page of each image from grin.hq.nasa.gov, scrapes title,
# description, creator, center and date out of the HTML, and appends one
# upload.py command per image to the shell script bot.sh.  Abstract pages that
# cannot be downloaded are recorded in errorlog.txt and skipped.
#
# Files written in the current directory: bot.sh, errorlog.txt.

use strict;
use warnings;
use LWP 5.64;
use LWP::Simple;
use utf8;
use Encode;
use Time::HiRes qw (sleep);

# File names used by the script.
my $UPLOAD_SCRIPT = 'bot.sh';
my $ERROR_LOG     = 'errorlog.txt';
my $ID_LIST       = 'gpn.txt';

# Wiki category substituted for any field that could not be scraped.
my $DETECTION_ERROR = '[[Category:GRIN images detection errors]]';

# Strip leading and trailing whitespace from a string and return the result.
sub trim {
    my ($string) = @_;
    $string =~ s/^\s+//;
    $string =~ s/\s+$//;
    return $string;
}

# Append $text to the file $path, dying with $failure_message when the file
# cannot be opened.  The file is reopened for every call so that its content
# is complete on disk even if the script dies later in the run.
sub append_to_file {
    my ( $path, $text, $failure_message ) = @_;
    open( my $handle, '>>', $path ) or die $failure_message;
    print {$handle} $text;
    close($handle) or die "Could not close " . $path . ": " . $!;
    return;
}

# Return the trimmed first capture group of $pattern matched against $page,
# or nothing (undef in scalar context) when the pattern does not match.
sub extract_field {
    my ( $page, $pattern ) = @_;
    if ( $page =~ $pattern ) {
        return trim($1);
    }
    return;
}

# Make $text safe for use inside a double-quoted bash word: backslash, double
# quote and dollar sign are backslash-escaped so bash neither expands nor
# drops them; backticks are replaced by apostrophes.
sub shell_escape {
    my ($text) = @_;
    $text =~ s/([\\"\$])/\\$1/g;
    $text =~ s/`/'/g;
    return $text;
}

my $browser = LWP::UserAgent->new();
print "======================\n";

# Start with a fresh upload script that only contains the shebang line.
open( my $script_handle, '>', $UPLOAD_SCRIPT ) or die "Datei nicht gefunden";
print {$script_handle} "#!/bin/bash\n";
close($script_handle) or die "Could not close " . $UPLOAD_SCRIPT . ": " . $!;

# Executable for everyone, but writable by the owner only: the script is
# meant to be run later, so it must not be world-writable.
chmod( 0755, $UPLOAD_SCRIPT );

# Truncate the error log left over from a previous run.
open( my $log_handle, '>', $ERROR_LOG ) or die "Errorlog creation failed";
close($log_handle);

open( my $gpn_handle, '<', $ID_LIST ) or die "Missing the GRIN numbers";

while ( my $gpnfile_line = <$gpn_handle> ) {
    my $gpnid = trim($gpnfile_line);

    # A blank line would otherwise turn into a request for ".html".
    next if $gpnid eq '';

    my $abstract_url = "http://grin.hq.nasa.gov/ABSTRACTS/" . $gpnid . ".html";
    my $remote_file  = "http://dayton.hq.nasa.gov/IMAGES/LARGE/" . $gpnid . ".jpg";

    # Downloading abstract page
    print( "Looking for " . $gpnid . " at " . $abstract_url . "\n" );
    my $abstract_website = $browser->get($abstract_url);
    if ( !$abstract_website->is_success ) {
        append_to_file(
            $ERROR_LOG,
            "Failed to download abstract for " . $gpnid . ". Error was: "
              . $abstract_website->status_line . "\n",
            "Errorlog failed"
        );

        # Nothing to scrape from an error response; go on with the next ID
        # instead of aborting the batch on the missing title below.
        next;
    }

    my $seite_code = $abstract_website->content();
    $seite_code =~ s/<br \/>\r?\n//gi;
    Encode::from_to( $seite_code, "iso-8859-1", "utf8" );

    # Retrieve title.  The title becomes part of the file name, so a page
    # without one is treated as fatal.
    my $abstract_title = extract_field( $seite_code,
        qr/<!-- ONE-LINE-DESCRIPTION-BEGIN -->(.*?)<!-- ONE-LINE-DESCRIPTION-END -->/s );
    defined $abstract_title or die "Could not find title";

    #print( "Title: ".$abstract_title."\n");

    # Retrieve description
    my $abstract_full_description = extract_field( $seite_code,
        qr/<!-- DESCRIPTION-BEGIN -->(.*?)<!-- DESCRIPTION-END/s );
    if ( defined $abstract_full_description ) {

        # Handle line endings and paragraphs: blank lines (paragraph breaks)
        # are parked behind a placeholder while the remaining single line
        # breaks are removed, then restored.
        $abstract_full_description =~ s/\n\n/P9@/g;
        $abstract_full_description =~ s/\n//g;
        $abstract_full_description =~ s/P9@/\n\n/g;

        # Collapse runs of spaces to a single space.
        $abstract_full_description =~ s/ {2,}/ /g;
    }
    else {
        $abstract_full_description = $DETECTION_ERROR;
    }

    #print( "Description: ".$abstract_full_description."\n");

    # Retrieve author
    my $abstract_creator =
      extract_field( $seite_code, qr{Creator/Photographer:</B>(.*?)<LI>}si );
    $abstract_creator = $DETECTION_ERROR if !defined $abstract_creator;

    #print( "Creator: ".$abstract_creator."\n" );

    # Retrieve reference ID info
    my $centerid = extract_field( $seite_code,
        qr/<!-- OTHERNUMBER-BEGIN -->(.*?)<!-- OTHERNUMBER-END -->/s );
    $centerid = $DETECTION_ERROR if !defined $centerid;

    #print( "CenterID: ".$centerid."\n" );

    my $centershort = extract_field( $seite_code,
        qr/<!-- CENTER-BEGIN -->(.*?)<!-- CENTER-END -->/s );
    $centershort = $DETECTION_ERROR if !defined $centershort;

    #print( "Center: ".$centershort."\n" );

    if ( lc($centershort) eq 'msfc' ) {
        $centerid = "MSFC-" . $centerid;
    }

    # Check if it's likely USGov-NASA, put in a cat in case it needs to be
    # checked by humans.
    my $permission =
      $abstract_creator =~ m/^NA[SC]A/i
      ? "{{PD-USGov-NASA}}"
      : "[[Category:GRIN images requiring copyright evaluation]]";

    # Retrieve the date.  The value is expected to be digits in YYYYMMDD
    # order; month and day may be absent.  Anything else is flagged for
    # review instead of being sliced into a meaningless {{date}} template.
    my $abstract_date =
      extract_field( $seite_code, qr/DATE-BEGIN --(.*?)-- DATE-END/s );
    if ( defined $abstract_date
        && $abstract_date =~ m/\A(\d{4})(\d{2})?(\d{2})?\z/ )
    {
        my $yyyy = $1;
        my $mm   = defined $2 ? $2 : '';
        my $dd   = defined $3 ? $3 : '';
        $abstract_date = "{{date|" . $yyyy . "|" . $mm . "|" . $dd . "}}";
    }
    else {
        $abstract_date = $DETECTION_ERROR;
    }

    # Check for possible dupes: any hit for the ID in the File namespace of
    # Commons marks the upload for human review.
    my $searchquery =
"http://commons.wikimedia.org/w/api.php?action=query&list=search&srwhat=text&srnamespace=6&format=xml&srsearch="
      . $gpnid;
    my $search_response = $browser->get($searchquery);
    my $duperesult      = "";
    if ( $search_response->is_success ) {
        my $search_xml = $search_response->content();

        # An empty <search /> element means the query returned no hits.
        if ( $search_xml !~ m/<search\ \/>/i ) {
            print "possible DUPE\n";
            $duperesult = "\n[[Category:GRIN possible dupes]]";
        }
    }

    # Assemble the final description for the page
    my $description = join(
        "\n",
        "{{Information",
        "|Description={{en|1=" . $abstract_full_description . "}}",
        "|Source=[" . $remote_file . " Great Images in NASA] [" . $abstract_url . " Description]",
        "|Date=" . $abstract_date,
        "|Author=" . $abstract_creator,
        "|Permission=" . $permission,
        "|other_versions=",
        "}}",
        "{{NASA-image|id=" . $gpnid . "|alternateid=" . $centerid . "|center=" . $centershort . "}}",
        "",
        "{{subst:unc}}",
        "[[Category:Great Images in NASA]]" . $duperesult
    );

    # Name under which the image is uploaded
    my $local_file = $abstract_title . " - " . $gpnid . ".jpg";

    # Downloading the image itself is currently disabled; upload.py is handed
    # the remote URL instead.
    # print( "Going to retrieve file ".$remote_file." and store it as ".$local_file."\n");
    # my $returncode = getstore( $remote_file, $local_file);
    # if( $returncode != 200 )
    # {
    #     open( ERRLOG, ">>errorlog.txt" ) || die "Errorlog failed";
    #     print ERRLOG "Failed to download ".$gpnid.". Error was: ".$returncode."\n";
    #     close( ERRLOG );
    # }

    # Normalizations done by the mediawiki upload
    $local_file =~ tr/ /_/;
    $local_file =~ tr{#:/}{-};

    # Bash normalization of the strings.  The text comes from a web page, so
    # everything bash would interpret inside double quotes is neutralised.
    my $quoted_description = shell_escape($description);
    my $quoted_local_file  = shell_escape($local_file);
    my $quoted_remote_file = shell_escape($remote_file);

    # Write the upload command for this file to bot.sh
    append_to_file(
        $UPLOAD_SCRIPT,
        "python2.5 /home/multichill/pywikipedia/upload.py -keep -noverify -filename:\""
          . $quoted_local_file . "\" \""
          . $quoted_remote_file . "\" \""
          . $quoted_description . "\"\n",
        "Datei nicht gefunden"
    );

    # Pause between images so the remote servers are not hammered.
    sleep(0.25);
}

close($gpn_handle);