#!/usr/bin/perl

# Convert the .rdf metadata files for UNU-INTECH pdf files to
# dublin_core.xml files

# .rdf files are assumed to be in the current directory

# output files are placed in DLC/1, DLC/2 etc. subdirectories

# Chris George UNU-IIST 8/5/04


# Main loop: for every .rdf file in the current directory create
# DLC/<index>/dublin_core.xml plus an empty DLC/<index>/contents file.
#
# $rdfFile and $haveElement must stay package globals (no "my"): doLine
# reads $rdfFile for its diagnostics, and doLine/finish update
# $haveElement.  The DLC filehandle is likewise shared with finish.

$index = 0;  # index of entries; used to name subdirectories

foreach $rdfFile (<*.rdf>) {
    # three-argument open, so the file name is never parsed for a mode
    open(RDF, '<', $rdfFile) or die "Can't open $rdfFile: $!\n";

    $index++;
    my $outDir = "DLC/$index";
    foreach my $dir ('DLC', $outDir) {
        next if -d $dir;
        mkdir($dir) or die "Can't create directory $dir: $!\n";
    }

    my $dlcFile = "$outDir/dublin_core.xml"; # output file
    open(DLC, '>', $dlcFile) or die "Can't open $dlcFile: $!\n";
    print DLC "<dublin_core>\n";

    # no entry is being collected at the start of a file
    $haveElement = 0;
    while (my $line = <RDF>) {
        doLine($line);
    }
    # the last entry of the file is still pending when input runs out
    finish($text, $element, $qualifier) if $haveElement == 1;

    # every record is marked as English
    print DLC "  <dcvalue element=\"language\" qualifier=\"iso\">en</dcvalue>\n";
    print DLC "</dublin_core>\n";

    # buffered write errors only show up when the handle is closed
    close(DLC) or die "Can't write $dlcFile: $!\n";
    close(RDF);

    # write empty contents file
    my $ctfFile = "$outDir/contents";
    open(CTF, '>', $ctfFile) or die "Can't open $ctfFile: $!\n";
    close(CTF) or die "Can't write $ctfFile: $!\n";
}



sub doLine {
    # Process one input line (1st parameter).  A line either starts with
    # a tag ("Tag: value") or with white space (continuation of the
    # entry started by the most recent tag).
    #
    # For a tag that maps to a Dublin Core field:
    # 1. use finish to output the previous entry, if any
    # 2. start collecting the data of the new entry in $text
    # 3. set $element and $qualifier according to the tag
    # For an ignored or unrecognised tag only step 1 is done, so the
    # continuation lines that follow it are dropped; unrecognised input
    # is also reported on STDOUT.
    # A continuation line is appended to $text.
    #
    # Uses the globals $text, $element, $qualifier and $haveElement
    # (shared with finish and the main loop) and reads $rdfFile for the
    # diagnostics.  The caller's $_ is left untouched.
    my ($line) = @_;

    # The tables are built inside the sub because the main loop of the
    # script runs before a file-level initialiser placed down here would
    # be executed.
    # tag => [element, qualifier, text put in front of the value]
    my %fieldFor = (
        'Author-Name'        => ['contributor', 'author',         ''],
        'Title'              => ['title',       'none',           ''],
        'Abstract'           => ['description', 'abstract',       ''],
        'Keywords'           => ['subject',     'none',           ''],
        'Classification-JEL' => ['subject',     'jel',            ''],
        'Creation-Date'      => ['date',        'issued',         ''],
        'File-URL'           => ['identifier',  'uri',            ''],
        'File-Format'        => ['format',      'mimetype',       ''],
        'File-Size'          => ['format',      'extent',         'PDF: '],
        'Handle'             => ['relation',    'ispartofseries', 'UNU-INTECH;'],
    );
    my %ignored = (
        'Template-type'         => 1,
        'Author-Workplace-Name' => 1,
        'Author-Email'          => 1,
        'Series'                => 1,
        'Number'                => 1,
    );

    # a tag starts in the first column and runs up to the first colon
    my ($tag, $value) = $line =~ /^([^:\s]+):\s*(.*)/;
    $tag = '' unless defined $tag;

    # copy of the line without its line terminator, for the diagnostics
    (my $shown = $line) =~ s/[\r\n]+\z//;

    if ($fieldFor{$tag}) {
        my ($newElement, $newQualifier, $prefix) = @{ $fieldFor{$tag} };
        finish($text, $element, $qualifier) if $haveElement == 1;
        $text        = $prefix . $value;
        $element     = $newElement;
        $qualifier   = $newQualifier;
        $haveElement = 1;
    }
    elsif ($ignored{$tag}) {
        finish($text, $element, $qualifier) if $haveElement == 1;
    }
    elsif ($line =~ /^\s+(.*)/) { # line starting with a blank; append text
        $text .= " $1" if $haveElement == 1; # provided not in an ignored entry
    }
    elsif ($shown =~ /^([^:]+)/) {
        my $unknown = $1;
        finish($text, $element, $qualifier) if $haveElement == 1;
        print STDOUT "Unexpected element $unknown in $rdfFile\n";
    }
    else {
        # empty string, or a line starting with a colon
        finish($text, $element, $qualifier) if $haveElement == 1;
        print STDOUT "Unexpected line $shown in $rdfFile\n";
    }
}

sub finish {
    # Write one completed entry to the DLC filehandle as a <dcvalue>
    # element and record the entry as completed.
    #
    # 1st parameter contains all the text of the entry, possibly split
    #     into several lines
    # 2nd and 3rd parameters are the element and qualifier attributes
    #
    # Side effects: prints to DLC (the caller must have opened it for
    # writing) and sets the global $haveElement to 0.
    #
    # An entry with element 'subject' and qualifier 'none' is a keyword
    # list in the form k1; k2; k3 (comma separator also accepted) and is
    # written as one <dcvalue> per keyword.
    my ($text, $element, $qualifier) = @_;

    # make text into one line and drop leading white space
    $text =~ s/\n\s*/ /g;
    $text =~ s/^\s*//;

    # Split the keywords BEFORE escaping: the ';' that terminates an XML
    # entity can then never be taken for a keyword separator.
    my @values = ($element eq 'subject' && $qualifier eq 'none')
        ? split(/[;,]\s*/, $text)
        : ($text);

    # fix special characters; a single pass, so the '&' of an entity
    # that has just been inserted is never escaped a second time
    my %entityFor = (
        '&' => '&amp;',
        '<' => '&lt;',
        '>' => '&gt;',
        "'" => '&apos;',
        '"' => '&quot;',
    );
    foreach my $value (@values) {
        $value =~ s/([&<>'"])/$entityFor{$1}/g;
    }

    # several keywords become separate elements, built as a single string
    $text = join("</dcvalue>\n  <dcvalue element=\"subject\" qualifier=\"none\">", @values);

    # output the element
    printf DLC "  <dcvalue element=\"%s\" qualifier=\"%s\">%s</dcvalue>\n", $element, $qualifier, $text;

    # record element as completed
    $main::haveElement = 0 if 0; # (no-op; keeps the qualified name visible)
    $haveElement = 0;
}
	   
	
