Wednesday, September 1, 2010

Batch Loading Collections into DSpace: Using Perl Scripts for Automation and Quality Control, by Maureen P. Walsh [Appendixes A-E]

Due to space considerations, Appendixes A-D were not included with the published article (http://www.ala.org/ala/mgrps/divs/lita/ital/292010/2903sep/walsh_pdf.cfm). They are provided below, along with Appendix E.



Appendix A. OJS Batch Loading Scripts

-- mkcol.sh --

#!/bin/sh
# Create a collection given a name and the handle suffix of the parent community.
# Scrapes the needed IDs out of DSpace web pages and drives the DSpace
# Collection Wizard by POSTing form data.

NAME="$1"
COMMUNITY_HANDLE="$2"

URL="https://kb.osu.edu/dspace"
NAME_PAT=">$NAME</option>"

# Login to DSpace and create the cookie.txt file.
curl -k -L -s $URL/password-login -d "login_email=[name removed]@osu.edu" -d "login_password=XXXXX" -c cookie.txt > /dev/null

# Cut the community_id out of the web page.
COMMUNITY_ID=`curl -k -L -s -b cookie.txt \
   $URL/handle/1811/$COMMUNITY_HANDLE \
   | grep -m1 name=\"community_id\" \
   | cut -d\" -f6`

# Cut the collection_id out of the web page.
COLLECTION_ID=`curl -k -L -s -b cookie.txt \
   $URL/tools/collection-wizard \
   -d "community_id=$COMMUNITY_ID" \
   | grep -m1 name=\"collection_id\" \
   | cut -d\" -f6`

# Begin building the collection.
curl -k -L -s -b cookie.txt \
   $URL/tools/collection-wizard \
   -d "public_read=true" \
   -d "workflow1=true" \
   -d "workflow2=" \
   -d "workflow3=" \
   -d "collection_id=$COLLECTION_ID" \
   -d "default-item=" \
   -d "stage=1" \
   -d "admins=" > /dev/null

# Finish making the collection.
curl -k -L -s -b cookie.txt \
   $URL/tools/collection-wizard \
   -F "name=$NAME" \
   -F "short_description=" \
   -F "introductory_text=" \
   -F "copyright_text=" \
   -F "side_bar_text=" \
   -F "provenance_description=" \
   -F "license=" \
   -F "file=" \
   -F "collection_id=$COLLECTION_ID" \
   -F "stage=2" \
   -F "permission=12"  > /dev/null

# Get and return the handle_id.
HANDLE_ID=`curl -k -L -s -b cookie.txt \
   $URL/handle/1811/$COMMUNITY_HANDLE \
   | grep -m1 "$NAME_PAT" \
   | cut -d\" -f2`
echo $HANDLE_ID
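
A usage sketch: the issue name must be quoted because it contains embedded spaces, and 686 is the handle suffix of the parent OJS community used by mkallcol.pl below. The script prints the handle of the new collection (the value shown is hypothetical):

./mkcol.sh "Ohio Journal of Science: Volume 74, Issue 2 (March, 1974)" 686
1811/22016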

-------------------------------------------------------------------------------------------------------------------------------

-- mkallcol.pl --

#!/usr/bin/perl

# Routine to clean up individual fields.
sub trim($)
{
    my $string = shift;
    $string =~ s/^\s+//;
    $string =~ s/\s+$//;
    return $string;
}

# Read the file of issue names into an array.
open(fh,"issues-prod.remainder");
@lines=<fh>;
close(fh);

$linenum = 0;
$| = 1;    # Unbuffer STDOUT so each issue ID prints before mkcol.sh's output.

$COMMUNITY = "686";

# For each issue get the parameters from the array and call the script to create the collection.
# mkcol.sh echoes the new collection's handle, so the output pairs each issue ID
# with its handle -- the same format as the load items file used later.
while ($linenum <= $#lines) {
        @fields = split(/\t/, $lines[$linenum]);
        $issue = $fields[1];
        chomp($issue);
        print "$fields[0] ";
        system("./mkcol.sh $issue $COMMUNITY");
        $linenum++;
}

-- Sample of the file of issue names --

V074N2  "Ohio Journal of Science: Volume  74, Issue 2 (March, 1974)"
V074N3  "Ohio Journal of Science: Volume  74, Issue 3 (May, 1974)"
V074N4  "Ohio Journal of Science: Volume  74, Issue 4 (July, 1974)"
V074N5  "Ohio Journal of Science: Volume  74, Issue 5 (September, 1974)"



-------------------------------------------------------------------------------------------------------------------------------

-- metadata.pl --

#!/usr/bin/perl

use Encode;     # Routines for UTF encoding.

# Routine to clean up individual fields of metadata.
sub trim($)
{
    my $string = shift;
    $string =~ s/^\s+//;
    $string =~ s/\s+$//;
    return $string;
}

# Read the metadata into an array.
open(fh,"<:encoding(UTF-16)", "OJSPhase2-1.txt");
@lines=<fh>;
close(fh);

# Process each line of metadata, consolidating lines for the same item.
$linenum = 0;
%lt=();
while ($linenum <= $#lines) {
        @fields = split(/\t/, $lines[$linenum]);
        if ($fields[0] =~ /^((v|V)[0-9]+(n|N)[0-9A-Za-z]+)/) {
                push(@{$lt{uc($1)}}, $linenum);
        }
        $linenum++;
}

# Build the load set for each item.
for $key (sort(keys(%lt))) {
        # Put each load set in its own subdirectory.
        print "mkdir ./src/$key\n";
        system "mkdir ./src/$key";
        # Process the lines for this load set.
        for $i (0 .. $#{$lt{$key}}) {
                $dir = sprintf("item_%03d", $i);
                print "mkdir ./src/$key/$dir\n";
                system "mkdir ./src/$key/$dir";
                # Create the XML for the metadata.
                open(fh,">:encoding(UTF-8)", "./src/$key/$dir/dublin_core.xml");
                print fh '<dublin_core>'."\n";
                @fields = split(/\t/, $lines[$lt{$key}[$i]]);
                $fields[1] =~ s/"//g;
                $fields[5] =~ s/"//g;
                if (length($fields[9])>0) {
                    print fh '<dcvalue element="identifier" qualifier="citation">'
                           . "$fields[1]. v$fields[3], n$fields[4] ($fields[5]), $fields[8]-$fields[9]</dcvalue>\n";
                } else {
                    print fh '<dcvalue element="identifier" qualifier="citation">'
                         ."$fields[1]. v$fields[3], n$fields[4] ($fields[5]), $fields[8]</dcvalue>\n";
                }
                if (length($fields[10]) > 0) {
                        $fields[10] =~ s/["]{1}([^"])/$1/g;
                        $fields[10] =~ s/("|"")$//g;
                        print fh '<dcvalue element="title" qualifier="">'.$fields[10]."</dcvalue>\n";
                }
                print fh '<dcvalue element="identifier" qualifier="issn">'.$fields[2]."</dcvalue>\n";
                print fh '<dcvalue element="date" qualifier="issued">'.$fields[6]."-".$fields[7]."</dcvalue>\n";
                # Process multiple authors.
                if (length($fields[11]) > 0) {
                        $fields[11] =~ s/"//g;
                        @authors = split(/;/,$fields[11]);
                        foreach $author (@authors) {
                                $author =~ s/^\s+//;
                                if (length($author) > 0) {
                                        print fh '<dcvalue element="creator" qualifier="">'.$author.'</dcvalue>'."\n";
                                }
                    }
                }
                if (length($fields[12]) > 0) {
                        $fields[12] =~ s/"//g;
                        print fh '<dcvalue element="description" qualifier="">Author Institution: '.$fields[12]."</dcvalue>\n";
                }
                if (length($fields[13]) > 0) {
                        $fields[13] =~ s/"//g;
                        print fh '<dcvalue element="description" qualifier="abstract">'.$fields[13]."</dcvalue>\n";
                }
                print fh "</dublin_core>\n";
                close(fh); # Finished creating the XML file.

                # Create the contents file.
                open(fh, ">./src/$key/$dir/contents");
                $fields[0] = trim($fields[0]);
                print fh "$fields[0].pdf\n";
                close(fh);

                # Move the data files into the load set.
                print "cp pdfs/$fields[0] ./src/$key/$dir\n";
                system "cp pdfs/$fields[0].pdf ./src/$key/$dir";
        }
}
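
The result is a load set in DSpace Simple Archive Format, one subdirectory per item. A hypothetical layout for the first item of issue V074N2 (the PDF name is illustrative):

./src/V074N2/item_000/dublin_core.xml
./src/V074N2/item_000/contents
./src/V074N2/item_000/V074N2_123.pdf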

-------------------------------------------------------------------------------------------------------------------------------

-- loaditems.pl --

#!/usr/bin/perl

#Load the list of issues into an array.
open(fh,"loaditems");
@lines=<fh>;
close(fh);

# Process each issue.
$linenum = 0;
while ($linenum <= $#lines) {
        @fields = split(/ /, $lines[$linenum]);
        chomp($fields[1]);
        # Add the issue to DSpace.
        system("./import.sh $fields[1] $fields[0]");
        $linenum++;
}

-- Sample of the load items file --

V074N2 1811/22016
V074N3 1811/22017
V074N4 1811/22018
V074N5 1811/22019
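
Each line of the file yields a call like the following (using the first sample line above):

./import.sh 1811/22016 V074N2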

-------------------------------------------------------------------------------------------------------------------------------

-- import.sh --

#!/bin/sh

# Usage: import.sh collection_id dir
# Import a load set generated by metadata.pl into the given DSpace collection.

COLLECTION_ID=$1
EPERSON=[name removed]@osu.edu
SOURCE_DIR=./src/$2
MAP_DIR=./prod-map
MAPFILE=$MAP_DIR/map.$2

/dspace/bin/dsrun org.dspace.app.itemimport.ItemImport --add --eperson=$EPERSON \
    --collection=$COLLECTION_ID --source=$SOURCE_DIR --mapfile=$MAPFILE
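
ItemImport writes one line per imported item to the map file, pairing the item directory with the handle DSpace assigned; intro.pl below reads these pairs back to build its links. A hypothetical map.V074N2:

item_000 1811/22020
item_001 1811/22021
item_002 1811/22022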

-------------------------------------------------------------------------------------------------------------------------------

-- intro.pl --

#!/usr/bin/perl

# Routine to clean up individual fields.
sub trim($)
{
    my $string = shift;
    $string =~ s/^\s+//;
    $string =~ s/\s+$//;
    return $string;
}

# Read the metadata into an array.
open(fh,"<:encoding(UTF-16)", "OJSPhase2-1.txt")
    or die "Can't open metadata file: $!";
@lines=<fh>;
close(fh);

# Process each line of metadata, consolidating lines for the same item.
$linenum = 0;
%lt=();
while ($linenum <= $#lines) {
        @fields = split(/\t/, $lines[$linenum]);
        if ($fields[0] =~ /^((v|V)[0-9]+(n|N)[0-9A-Za-z]+)/) {
                push(@{$lt{uc($1)}}, $linenum);
        }
        $linenum++;
}

# Assemble each intro.
for $key (sort(keys(%lt))) {
        open(fh,"./prod-map/map.$key") or next;
        @fids=<fh>;
        close(fh);
        @fids = sort(@fids);

        print "Generating intro for $key ...\n";
        open(fh,">:encoding(UTF-8)", "./src/$key/intro");

        # Create the HTML for each article.
        for ($i = 0; $i <= $#{$lt{$key}}; $i++) {
                @fields = split(/\t/, $lines[$lt{$key}[$i]]);
                if (length($fields[10]) > 0) {
                        $fields[10] =~ s/["]{1}([^"])/$1/g;
                        $fields[10] =~ s/("|"")$//g;
                        print fh "<strong>$fields[10]</strong><br>\n";
                }
                # Create the list of authors.
                $authcnt = 0;
                if (length($fields[11]) > 0) {
                        $fields[11] =~ s/"//g;
                        @authors = split(/;/,$fields[11]);
                        foreach $author (@authors) {
                                $author =~ s/^\s+//;
                                if ($authcnt > 0) {
                                        print fh "; $author";
                                } else {
                                        print fh $author;
                                }
                                $authcnt++;
                        }
                }
                # Add page numbers.
                if (length($fields[8]) > 0) {
                        print fh " pp. $fields[8]";
                }
                if (length($fields[9]) > 0) {
                        print fh "-$fields[9]";
                }
                print fh "<br>\n";
                # Create links for each article.
                @item_hid = split(/\s/,$fids[$i]);
                $itemno = $item_hid[0];
                $itemhid = $item_hid[1];
                $fields[0] = trim($fields[0]);
                $filename = "./src/$key/$itemno/".$fields[0].".pdf";
                @st = stat($filename) or die "No $filename: $!";
                $size = int($st[7]/1024);
                $url_1 = "/dspace/handle/$itemhid";
                $url_2 = "/dspace/bitstream/$itemhid/1/$fields[0]";
                print fh '<a href="'.$url_1.'">Article description</a> | <a href="'.$url_2.'">Article Full Text PDF ('.$size.'KB)</a><br><br>';
                print fh "\n";
        }
        close(fh);
}
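
The generated intro file holds one HTML block per article. A hypothetical entry (title, authors, handle, file name, and size are all illustrative):

<strong>A SURVEY OF OHIO STREAM FAUNA</strong><br>
Smith, John A.; Jones, Mary B. pp. 123-130<br>
<a href="/dspace/handle/1811/22021">Article description</a> | <a href="/dspace/bitstream/1811/22021/1/V074N2_123.pdf">Article Full Text PDF (245KB)</a><br><br>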

-------------------------------------------------------------------------------------------------------------------------------

-- installintro.sh --

#!/bin/sh
# Install an intro given a load-set directory and a collection handle.
DIR="$1"
HANDLE="$2"
URL="https://kb.osu.edu/dspace"
# Login to DSpace
curl -k -L -s $URL/password-login -d "login_email=[name removed]@osu.edu" -d "login_password=XXXXX" -c cookie.txt > /dev/null

# Cut the community_id out of the web page.
COMMUNITY_ID=`curl -k -L -s -b cookie.txt \
    $URL/handle/$HANDLE \
    | grep -m1 name=\"community_id\" \
    | cut -d\" -f6`

# Cut the collection_id out of the web page.
COLLECTION_ID=`curl -k -L -s -b cookie.txt \
    $URL/handle/$HANDLE \
    | grep -m1 name=\"collection_id\" \
    | cut -d\" -f6`

# Cut the title out of the web page.
TITLE=`curl -k -L -s -b cookie.txt \
    $URL/tools/edit-communities \
    -d "community_id=$COMMUNITY_ID" \
    -d "collection_id=$COLLECTION_ID" \
    -d "action=4" \
    | grep -m1 name=\"name\" \
    | cut -d\" -f6`

# Put the introductory text in DSpace.
curl -k -L -s -b cookie.txt \
    $URL/tools/edit-communities \
    -d "name=$TITLE" \
    -d "short_description=" \
    -d "introductory_text=`cat ./src/$DIR/intro`" \
    -d "copyright_text=" \
    -d "side_bar_text=" \
    -d "license=" \
    -d "provenance_description=" \
    -d "community_id=$COMMUNITY_ID" \
    -d "collection_id=$COLLECTION_ID" \
    -d "create=false" \
    -d "action=9" \
    -d "submit=Update" > /dev/null

-------------------------------------------------------------------------------------------------------------------------------

-- ldallintro.pl --

#!/usr/bin/perl

# Load file of issues into an array.
open(fh,"loaditems");
@lines=<fh>;
close(fh);

$linenum = 0;

# Process each intro.
while ($linenum <= $#lines) {
        chomp($lines[$linenum]);
        print("$lines[$linenum]\n");
        # Each line holds the directory and the handle separated by a space;
        # the shell splits them into the two arguments installintro.sh expects.
        system("./installintro.sh $lines[$linenum]");
        $linenum++;
}








Appendix B. MSS Phase Two Scripts

-- mkxml2.pl --

#!/usr/bin/perl

# Load routines for UTF-16 and UTF-8
use Encode;

# Routine to clean up metadata fields
sub trim($)
{
    my $string = shift;
    $string =~ s/^\s+//;
    $string =~ s/\s+$//;
    $string =~ s/^"//;
    $string =~ s/"$//;
    return $string;
}

# Load metadata into an array.
open(fh,"<:encoding(UTF-16)", "MSA-phase-2-v3.txt");
@lines=<fh>;
close(fh);

$linenum = 0;
%lt=();

# Split tab-separated metadata fields and group line numbers by item ID.
while ($linenum <= $#lines) {
    @fields = split(/\t/, $lines[$linenum]);
    if ($fields[4] =~ /^([0-9]{4}-[^0-9]+[0-9]+)/) {
        push(@{$lt{$1}}, $linenum);
    }
    $linenum++;
}

$cnt1 = 0; $cnt2 = 0; $cnt3 = 0; $cnt4 = 0; $cnt5 = 0; $cnt6 = 0;

# Process the metadata item by item.
for $key (sort(keys(%lt))) {
    $year = substr($key, 0, 4);

    # Generate possible image file names.
    $keyzero = substr($key, 0, -1) . "0" . substr($key, -1, 1);
    $keyuc = uc($key);
    $keyuczero = uc($keyzero);

    # Compensate for inconsistent naming of images in metadata.
    if (-e "../images/$year/$key.jpg") {
        $filename = $key;
    } elsif (-e "../images/$year/$keyzero.jpg") {
        $filename = $keyzero;
    } elsif (-e "../images/$year/$keyuc.jpg") {
        $filename = $keyuc;
    } elsif (-e "../images/$year/$keyuczero.jpg") {
        $filename = $keyuczero;
    } else {
        $filename = "";
        print " NO FILE FOUND images/$year/$key.jpg\n";
    }

    # Divide output into separate load sets based on year.
    if (($year >= "1946") && ($year <= "1959")) {
        $dir = sprintf("1/item_%04d", $cnt1++);
    }
    if (($year >= "1960") && ($year <= "1969")) {
        $dir = sprintf("2/item_%04d", $cnt2++);
    }
    if (($year >= "1970") && ($year <= "1979")) {
        $dir = sprintf("3/item_%04d", $cnt3++);
    }
    if (($year >= "1980") && ($year <= "1989")) {
        $dir = sprintf("4/item_%04d", $cnt4++);
    }
    if (($year >= "1990") && ($year <= "1999")) {
        $dir = sprintf("5/item_%04d", $cnt5++);
    }
    if (($year >= "2000") && ($year <= "2100")) {
        $dir = sprintf("6/item_%04d", $cnt6++);
    }

    # Make a directory for the item.
    print "mkdir $dir\n";
    system "mkdir $dir";

    # Create the XML file from the metadata.
    open(fh,">:encoding(UTF-8)", "$dir/dublin_core.xml");
    print fh '<dublin_core>'."\n";
    print fh '<dcvalue element="identifier" qualifier="none">'
        .$key.'</dcvalue>'."\n";
    print fh '<dcvalue element="type" qualifier="none">Article</dcvalue>'."\n";
    print fh '<dcvalue element="language" qualifier="iso">en</dcvalue>'."\n";
    $affiliation = '';
    $affiliation1 = '';
    $affiliation2 = '';

    # Metadata for items with multiple authors, each with individual
    # affiliations, spans multiple lines. Collect those lines and
    # produce XML from them.
    for $i (0 .. $#{$lt{$key}}) {
        @fields = split(/\t/, $lines[$lt{$key}[$i]]);
        $title = trim($fields[9]);
        if (length($title) > 0) {
            $title =~ s/["]{1}([^"])/$1/g;
            $title =~ s/("|"")$//g;
            print fh '<dcvalue element="title" qualifier="none">'
                .$title.'</dcvalue>'."\n";
        }
        $year1 = trim($fields[1]);
        if (length($year1) > 0) {
            print fh '<dcvalue element="date" qualifier="issued">'
                ."$year</dcvalue>\n";
        }
        $author = trim($fields[5]);
        if (length($author) > 0) {
            $author =~ s/(\$|\^|\{|\}|\*)//g;
            print fh '<dcvalue element="creator" qualifier="none">'
                .$author.'</dcvalue>'."\n";
        }
        $abstract = trim($fields[10]);
        if (length($abstract) > 0) {
            print fh '<dcvalue element="description" qualifier="abstract">'
                .$abstract.'</dcvalue>'."\n";
        }
        if (length(trim($fields[6])) > 0) {
            $affiliation1 = trim($fields[6]);
        }
        if (length(trim($fields[7])) > 0) {
            $affiliation2 = trim($fields[7]);
        }
        if ((length(trim($fields[6])) > 0)
            || (length(trim($fields[7])) > 0)) {
            if ((length(trim($fields[6])) == 0)
                && (length($affiliation1) == 0)) {
                $append = $affiliation2;
            } elsif ((length(trim($fields[7])) == 0)
                && (length($affiliation2) == 0)) {
                $append = $affiliation1;
            } else {
                $append = $affiliation1.", ".$affiliation2;
            }
            if (length($affiliation) > 0) {
                $affiliation = $affiliation."; ".$append;
            } else {
                $affiliation = $append;
            }
        }
        $note = trim($fields[11]);
        if (length($note) > 0) {
            print fh '<dcvalue element="description" qualifier="none">'
                .$note.'</dcvalue>'."\n";
        }
    } # Done processing multiple authors.

    # Finish producing the XML for this item.
    print fh '<dcvalue element="description" qualifier="none">Author Institution: '
        .$affiliation.'</dcvalue>'."\n";
    print fh '</dublin_core>'."\n";
    close(fh);

    # Create the 'contents' file.
    open(fh, ">$dir/contents");

    # Use 'ne' for the string comparison; '!=' would compare numerically.
    if ($filename ne "") {
        print fh "$filename.jpg\n";
        $cmd = "cp \"../images/$year/$filename.jpg\" $dir";
        print $cmd."\n";
        system $cmd;
    }
    close(fh);
} # Finished processing this item.

-------------------------------------------------------------------------------------------------------------------------------

-- import_collections.sh --

#!/bin/sh
#
# Import a collection from files generated on dspace
COLLECTION_ID=1811/6634
EPERSON="[name removed]@osu.edu"
SOURCE_DIR=./5
BASE_ID=`basename $COLLECTION_ID`
MAPFILE=./map.$BASE_ID

/dspace/bin/dsrun org.dspace.app.itemimport.ItemImport --add --eperson=$EPERSON --collection=$COLLECTION_ID --source=$SOURCE_DIR --mapfile=$MAPFILE








Appendix C. Example dublin_core.xml for MSS 2009

<dublin_core>
    <dcvalue element="identifier" qualifier="none">2009-MJ-10</dcvalue>
    <dcvalue element="title" qualifier="none">VIBRATIONAL OVERTONE SPECTRA OF $C_2H_6$ AND $C_2H_4$ IN CRYOGENIC LIQUIDS</dcvalue>
    <dcvalue element="date" qualifier="issued">2009</dcvalue>
    <dcvalue element="description" qualifier="abstract">Vibrational overtone spectra of $C_2H_6$ and $C_2H_4$ in cryogenic solutions were recorded between 5000 and 14000 cm$^{-1}$. Spectral regions for the first four overtones were measured using a Fourier transform spectrophotometer. The fifth overtone $(\Delta\nu=6)$ spectra between 15,000 and 16,000 cm$^{-1}$ were recorded with a double beam (pump-probe) thermal lens technique using concentrations as low as 10$^{-3}$ mole fraction. The peak frequency shift $(\Delta\omega)$ from gas phase to solution is explained by the change in harmonic frequency and anharmonicity in solution with respect to the gas phase values.  The bandwidth $(\Delta\omega_{1/2})$ of the $(\Delta\nu= 6)$ C-H absorption bands in solution can be explained in terms of collisions with the solvent molecules.</dcvalue>
    <dcvalue element="description" qualifier="none">Author Institution: Department of Chemistry and Biochemistry, Baylor University, Waco, Texas, 76798</dcvalue>
    <dcvalue element="type" qualifier="none">Article</dcvalue>
    <dcvalue element="language" qualifier="iso">en</dcvalue>
    <dcvalue element="creator" qualifier="none">Diez-y-Riega, Maria H.</dcvalue>
    <dcvalue element="creator" qualifier="none">Manzanares, Carlos E.</dcvalue>
</dublin_core>








Appendix D. Section of MSS Author Quality Control Script

-- flipper.pl --

#!/usr/bin/perl

#### Sections omitted ####

#### Begin author correction block ####

    $creatorxml = "";
    if (length($creators) > 0) {
             # Creator name are contaminated with comments.
             # Remove the comments.
         $creators =~ s/"//g;
             $creators =~ s/\\thanks\{.+\}//;
             $creators =~ s/\\thanks \{.+\}//;
             $creators =~ s/\\footnote\{.+\}//;
       # Multiple creators are separated by ';' or AND in the metadata.
       @creatorlist = split(/;| and | AND /,$creators);
             # Process each creator.
         foreach $creator (@creatorlist) {
                 # Remove per name comments and punctuation.
           $creator =~ s/^\s+//;
                 $creator =~ s/FULL NAME OF AUTHOR FROM OTHER LOCATION//;
                 $creator =~ s/\\underline \{(.+)\}/$1/;
                 $creator =~ s/\\address\{//;
                 $creator =~ s/\\//g;
                 $creator =~ s/\{//g;
                 $creator =~ s/\}//g;
                 $creator =~ s/\^//g;
                 $creator =~ s/\'//g;
                 $creator =~ s/\%//g;
                 $creator =~ s/^AND$|^and$//;
           if (length($creator) > 0) {
                         $creator =~ s/\.(\w)/. $1/g;
                         # Split the name apart on spaces.
                 @nameparts = split(/ /,$creator);
                         # Process each part of the name.
                         for($i = 0;$i <= $#nameparts; $i++) {
                             # Adjust case.
                             @nameparts[$i] = lc(@nameparts[$i]);
                             @nameparts[$i] = ucfirst(@nameparts[$i]);
                             $c = rindex(@nameparts[$i],"-");
                             # Uppercase hyphenated names.
                             if ($c != -1) {
                                     $r = uc(substr(@nameparts[$i],$c+1,1));
                                     substr(@nameparts[$i],$c+1,1,$r);
                             }
                         }
                         $lname = pop(@nameparts);
                         $nl = @nameparts[-1];
                         # Handle name prefixes.
                         if ($nl eq "Von"
                             || $nl eq "Vander"
                             || $nl eq "Le"
                             || $nl eq "De"
                             || $nl eq "de") {
                             $lname = pop(@nameparts)." ".$lname;
                         }
                         # Handle special case name parts
                         if ($nl eq "Der" ) {
                             $nl2 = @nameparts[-2];
                             $lname = pop(@nameparts)." ".$lname;
                             if ($nl2 eq "Van" ) {
                                     $lname = pop(@nameparts)." ".$lname;
                             }
                         }

                         # assemble the name and make the XML.
                         $name = $lname .", ".join(" ",@nameparts);
                 $creatorxml .= '<dcvalue element="creator" qualifier="">'
                     .$name.'</dcvalue>'."\n    ";
             }
         }
    } # Done processing creators of this item.

  
#### End author correction block ####
#### Sections omitted ####
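
To illustrate the correction block, hypothetical raw creator strings would be flipped as follows:

C.E. MANZANARES      ->  Manzanares, C. E.
maria VON helden     ->  Von Helden, Maria
J. VAN DER WAALS     ->  Van Der Waals, J.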








Appendix E. MSS 2009 Batch Loading Scripts

-- mkxml2009.pl --

#!/usr/bin/perl

use Encode;                 # Routines for UTF encoding
use Text::xSV;              # Routines to process CSV files.
use File::Basename;

# Open and read the comma-separated metadata file.
my $csv = Text::xSV->new;
#$csv->set_sep("\t");    # Uncomment for tab-separated files.
$csv->open_file("MSS2009.csv");
$csv->read_header();     # Process the CSV column headers.

# Constants for file and directory names.
$basedir = "/common/batch/input/mss/";
$indir = "$basedir/2009";
$xmldir= "./2009xml";
$imagesubdir= "processed_images";
$filename = "dublin_core.xml";

# Process each line of metadata, one line per item.
$linenum = 1;
while ($csv->get_row()) {
    # This divides the item's metadata into fields, each in its own variable.
    # Note that $abstract receives the Image_file_name column (the scanned
    # abstract image), while $gif and $ppt name the slide images and the
    # PowerPoint file for the talk.
    my (
            $identifier,
            $title,
            $creators,
            $description_abstract,
            $issuedate,
            $description,
            $description2,
            $abstract,
            $gif,
            $ppt,
    ) = $csv->extract(
            "Talk_id",
            "Title",
            "Creators",
            "Abstract",
            "IssueDate",
            "Description",
            "AuthorInstitution",
            "Image_file_name",
            "Talk_gifs_file",
            "Talk_ppt_file"
    );

    $creatorxml = "";
    # Multiple creators are separated by ';' in the metadata.
    if (length($creators) > 0) {
            # Create XML for each creator.
        @creatorlist = split(/;/,$creators);
        foreach $creator (@creatorlist) {
            if (length($creator) > 0) {
                $creatorxml .= '<dcvalue element="creator" qualifier="none">'
                .$creator.'</dcvalue>'."\n    ";
             }
         }
    } # Done processing creators for this item.

    # Create the XML string for the Abstract.
    $abstractxml = "";
    if (length($description_abstract) > 0) {
            # Convert special metadata characters for use in xml/html.
        $description_abstract =~ s/\&/&amp;/g;
        $description_abstract =~ s/\>/&gt;/g;
        $description_abstract =~ s/\</&lt;/g;
            # Build the Abstract in XML.
        $abstractxml = '<dcvalue element="description" qualifier="abstract">'
            .$description_abstract.'</dcvalue>';
    }

    # Create the XML string for the Description.
    $descriptionxml = "";
    if (length($description) > 0) {
            # Convert special metadata characters for use in xml/html.
        $description=~ s/\&/&amp;/g;
        $description=~ s/\>/&gt;/g;
        $description=~ s/\</&lt;/g;
            # Build the Description in XML.
        $descriptionxml = '<dcvalue element="description" qualifier="none">'
            .$description.'</dcvalue>';
    }

    # Create the XML string for the Author Institution.
    $description2xml = "";
    if (length($description2) > 0) {
            # Convert special metadata characters for use in xml/html.
        $description2=~ s/\&/&amp;/g;
        $description2=~ s/\>/&gt;/g;
        $description2=~ s/\</&lt;/g;
            # Build the Author Institution XML.
        $description2xml = '<dcvalue element="description" qualifier="none">'
            .'Author Institution: ' .$description2.'</dcvalue>';
    }

    # Convert special characters in title.
    $title=~ s/\&/&amp;/g;
    $title=~ s/\>/&gt;/g;
    $title=~ s/\</&lt;/g;

    # Create XML File
    $subdir = $xmldir."/".$linenum;
    system "mkdir $basedir/$subdir";
    open(fh,">:encoding(UTF-8)", "$basedir/$subdir/$filename");
    print fh <<"XML";
<dublin_core>
    <dcvalue element="identifier" qualifier="none">$identifier</dcvalue>
    <dcvalue element="title" qualifier="none">$title</dcvalue>
    <dcvalue element="date" qualifier="issued">$issuedate</dcvalue>
    $abstractxml
    $descriptionxml
    $description2xml
    <dcvalue element="type" qualifier="none">Article</dcvalue>
    <dcvalue element="language" qualifier="iso">en</dcvalue>
    $creatorxml
</dublin_core>
XML
    close(fh);

# Create contents file and move files to the load set.

    # Copy item files into the load set.
    if (defined($abstract) && length($abstract) > 0) {
        system "cp $indir/$abstract $basedir/$subdir";
    }

    $sourcedir = substr($abstract, 0, 5);
    if (defined($ppt) && length($ppt) > 0 ) {
         system "cp $indir/$sourcedir/$sourcedir/*.* $basedir/$subdir/";
    }
   
    if (defined($gif) && length($gif) > 0 ) {
         system "cp $indir/$sourcedir/$imagesubdir/*.* $basedir/$subdir/";
    }

    # Make the 'contents' file and fill it with the file names.
    system "touch $basedir/$subdir/contents";

    if (defined($gif) && length($gif) > 0
        && -d "$indir/$sourcedir/$imagesubdir" ) {
        # Sort items in reverse order so they show up right in DSpace.
        # This is a hack that depends on how the DB returns items
        # in unsorted (physical) order. There are better ways to do this.
        system "cd $indir/$sourcedir/$imagesubdir/;"
            . " ls *[0-9][0-9].* | sort -r >> $basedir/$subdir/contents";
        system "cd $indir/$sourcedir/$imagesubdir/;"
            . " ls *[a-zA-Z][0-9].* | sort -r  >> $basedir/$subdir/contents";
    }

    if (defined($ppt) && length($ppt) > 0
        && -d "$indir/$sourcedir/$sourcedir" ) {
        system "cd $indir/$sourcedir/$sourcedir/;"
            . " ls *.* >> $basedir/$subdir/contents";
    }
   
    # Put the Abstract in last, so it displays first.
    system "cd $basedir/$subdir; basename $abstract >>"
        . " $basedir/$subdir/contents";

    $linenum++;

} # Done processing an item.
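
For an item with slide images, a PowerPoint file, and an abstract image, the resulting contents file might look like this (all file names hypothetical); the abstract image is listed last so DSpace displays it first:

slide09.gif
slide08.gif
slide01.gif
talk.ppt
2009-MJ-10.jpg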

-------------------------------------------------------------------------------------------------------------------------------

-- import.sh --

#!/bin/sh
#
# Import a collection from files generated on dspace
#
COLLECTION_ID=1811/6635
EPERSON=[name removed]@osu.edu
SOURCE_DIR=./2009xml
BASE_ID=`basename $COLLECTION_ID`
MAPFILE=./map-dspace03-mss2009.$BASE_ID

/dspace/bin/dsrun org.dspace.app.itemimport.ItemImport --add --eperson=$EPERSON --collection=$COLLECTION_ID --source=$SOURCE_DIR --mapfile=$MAPFILE 









