Appendixes A-D are included below, along with Appendix E.
Appendix A. OJS Batch Loading Scripts
-- mkcol.sh --
#!/bin/sh
# Create a collection given a name and the handle number of the parent community.
# Scrapes the needed IDs from DSpace web pages and posts form data to the DSpace
# Collection Wizard.
NAME="$1"
COLLECTION_HANDLE="$2"
URL="https://kb.osu.edu/dspace"
NAME_PAT=">$NAME</option>"
# Login to DSpace and create the cookie.txt file.
curl -k -L -s $URL/password-login -d "login_email=[name removed]@osu.edu" -d "login_password=XXXXX" -c cookie.txt > /dev/null
# Cut the community_id out of the web page.
COMMUNITY_ID=`curl -k -L -s -b cookie.txt \
$URL/handle/1811/$COLLECTION_HANDLE \
| grep -m1 name=\"community_id\" \
| cut -d\" -f6`
# Cut the collection_id out of the web page.
COLLECTION_ID=`curl -k -L -s -b cookie.txt \
$URL/tools/collection-wizard \
-d "community_id=$COMMUNITY_ID" \
| grep -m1 name=\"collection_id\" \
| cut -d\" -f6`
# Begin building the collection.
curl -k -L -s -b cookie.txt \
$URL/tools/collection-wizard \
-d "public_read=true" \
-d "workflow1=true" \
-d "workflow2=" \
-d "workflow3=" \
-d "collection_id=$COLLECTION_ID" \
-d "default-item=" \
-d "stage=1" \
-d "admins=" > /dev/null
# Finish making the collection.
curl -k -L -s -b cookie.txt \
$URL/tools/collection-wizard \
-F "name=$NAME" \
-F "short_description=" \
-F "introductory_text=" \
-F "copyright_text=" \
-F "side_bar_text=" \
-F "provenance_description=" \
-F "license=" \
-F "file=" \
-F "collection_id=$COLLECTION_ID" \
-F "stage=2" \
-F "permission=12" > /dev/null
# Get and return the handle_id.
HANDLE_ID=`curl -k -L -s -b cookie.txt \
$URL/handle/1811/$COLLECTION_HANDLE \
| grep -m1 "$NAME_PAT" \
| cut -d\" -f2`
echo $HANDLE_ID
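A typical invocation, using values from the sample files below, would be:
./mkcol.sh "Ohio Journal of Science: Volume 74, Issue 2 (March, 1974)" 686
The script logs in, walks the Collection Wizard, and prints the handle of the new collection (for example 1811/22016), which later serves as the load target.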
-------------------------------------------------------------------------------------------------------------------------------
-- mkallcol.pl --
#!/usr/bin/perl
# Routine to clean up individual fields.
sub trim($)
{
my $string = shift;
$string =~ s/^\s+//;
$string =~ s/\s+$//;
return $string;
}
# Read the file of issue names into an array.
open(fh,"issues-prod.remainder");
@lines=<fh>;
close(fh);
$linenum = 0;
%lt=();
$COMMUNITY = "686";
# For each issue get the parameters from the array and call the script to create the collection.
while ($linenum <= $#lines) {
@fields = split(/\t/, $lines[$linenum]);
$issue = $fields[1];
chop($issue);
system("echo -n $fields[0] ");
print " ";
system("./mkcol.sh $issue $COMMUNITY");
$linenum++;
}
-- Sample of the file of issue names --
V074N2 "Ohio Journal of Science: Volume 74, Issue 2 (March, 1974)"
V074N3 "Ohio Journal of Science: Volume 74, Issue 3 (May, 1974)"
V074N4 "Ohio Journal of Science: Volume 74, Issue 4 (July, 1974)"
V074N5 "Ohio Journal of Science: Volume 74, Issue 5 (September, 1974)"
-------------------------------------------------------------------------------------------------------------------------------
-- metadata.pl --
#!/usr/bin/perl
use Encode; # Routines for UTF encoding.
# Routine to clean up individual fields of metadata.
sub trim($)
{
my $string = shift;
$string =~ s/^\s+//;
$string =~ s/\s+$//;
return $string;
}
# Read the metadata into an array.
open(fh,"<:encoding(UTF-16)", "OJSPhase2-1.txt");
@lines=<fh>;
close(fh);
# Process each line of metadata, consolidating lines for the same item.
$linenum = 0;
%lt=();
while ($linenum <= $#lines) {
@fields = split(/\t/, $lines[$linenum]);
if ($fields[0] =~ /^((v|V)[0-9]+(n|N)[0-9A-Za-z]+)/) {
$lt{uc($1)} = [@{$lt{uc($1)}}, $linenum];
}
$linenum++;
}
# Build the load set for each item.
for $key (sort(keys(%lt))) {
# Put each load set in its own subdirectory.
print "mkdir ./src/$key\n";
system "mkdir ./src/$key";
# Process the lines for this load set.
for $i (0 .. $#{$lt{$key}}) {
$dir = sprintf("item_%03d", $i);
print "mkdir ./src/$key/$dir\n";
system "mkdir ./src/$key/$dir";
# Create the XML for the metadata.
open(fh,">:encoding(UTF-8)", "./src/$key/$dir/dublin_core.xml");
print fh '<dublin_core>'."\n";
@fields = split(/\t/, $lines[$lt{$key}[$i]]);
$fields[1] =~ s/"//g;
$fields[5] =~ s/"//g;
if (length($fields[9])>0) {
print fh '<dcvalue element="identifier" qualifier="citation">'
. "$fields[1]. v$fields[3], n$fields[4] ($fields[5]), $fields[8]-$fields[9]</dcvalue>\n";
} else {
print fh '<dcvalue element="identifier" qualifier="citation">'
."$fields[1]. v$fields[3], n$fields[4] ($fields[5]), $fields[8]</dcvalue>\n";
}
if (length($fields[10]) > 0) {
$fields[10] =~ s/["]{1}([^"])/$1/g;
$fields[10] =~ s/("|"")$//g;
print fh '<dcvalue element="title" qualifier="">'.$fields[10]."</dcvalue>\n";
}
print fh '<dcvalue element="identifier" qualifier="issn">'.$fields[2]."</dcvalue>\n";
print fh '<dcvalue element="date" qualifier="issued">'.$fields[6]."-".$fields[7]."</dcvalue>\n";
# Process multiple authors.
if (length($fields[11]) > 0) {
$fields[11] =~ s/"//g;
@authors = split(/;/,$fields[11]);
foreach $author (@authors) {
$author =~ s/^\s+//;
if (length($author) > 0) {
print fh '<dcvalue element="creator" qualifier="">'.$author.'</dcvalue>'."\n";
}
}
}
if (length($fields[12]) > 0) {
$fields[12] =~ s/"//g;
print fh '<dcvalue element="description" qualifier="">Author Institution: '.$fields[12]."</dcvalue>\n";
}
if (length($fields[13]) > 0) {
$fields[13] =~ s/"//g;
print fh '<dcvalue element="description" qualifier="abstract">'.$fields[13]."</dcvalue>\n";
}
print fh "</dublin_core>\n";
close(fh); # Finished creating the XML file.
# Create the contents file.
open(fh, ">./src/$key/$dir/contents");
$fields[0] = trim($fields[0]);
print fh "$fields[0].pdf\n";
close(fh);
# Move the data files into the load set.
print "cp pdfs/$fields[0] ./src/$key/$dir\n";
system "cp pdfs/$fields[0].pdf ./src/$key/$dir";
}
}
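The result is one DSpace import load set (Simple Archive Format) per issue under ./src, with each article in its own numbered item directory. A sketch of the layout for one item (the PDF name comes from the first metadata column and is shown here as a placeholder):
./src/V074N2/item_000/dublin_core.xml
./src/V074N2/item_000/contents
./src/V074N2/item_000/<column-0-name>.pdf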
-------------------------------------------------------------------------------------------------------------------------------
-- loaditems.pl --
#!/usr/bin/perl
#Load the list of issues into an array.
open(fh,"loaditems");
@lines=<fh>;
close(fh);
# Process each issue.
$linenum = 0;
while ($linenum <= $#lines) {
@fields = split(/ /, $lines[$linenum]);
chop($fields[1]);
# Add the issue to DSpace.
system("./import.sh $fields[1] $fields[0]");
$linenum++;
}
-- Sample of the load items file --
V074N2 1811/22016
V074N3 1811/22017
V074N4 1811/22018
V074N5 1811/22019
-------------------------------------------------------------------------------------------------------------------------------
-- import.sh --
#!/bin/sh
# import.sh collection_id dir
# Import a load set generated on the DSpace host.
# Requires the handle of the destination collection and the name of the load-set directory under ./src.
COLLECTION_ID=$1
EPERSON=[name removed]@osu.edu
SOURCE_DIR=./src/$2
MAP_DIR=./prod-map/
BASE_ID=`basename $COLLECTION_ID`
MAPFILE=./$MAP_DIR/map.$2
/dspace/bin/dsrun org.dspace.app.itemimport.ItemImport --add --eperson=$EPERSON --collection=$COLLECTION_ID --source=$SOURCE_DIR --mapfile=$MAPFILE
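With the collection handle and directory name taken from the load items file above, a single load would be invoked as:
./import.sh 1811/22016 V074N2
loaditems.pl simply repeats this call for each line of the file.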
-------------------------------------------------------------------------------------------------------------------------------
-- intro.pl --
#!/usr/bin/perl
# Routine to clean up individual fields.
sub trim($)
{
my $string = shift;
$string =~ s/^\s+//;
$string =~ s/\s+$//;
return $string;
}
# Read the metadata into an array.
open(fh,"<:encoding(UTF-16)", "OJSPhase2-1.txt")
or die "Can't open metadata file: $!";
@lines=<fh>;
close(fh);
# Process each line of metadata, consolidating lines for the same item.
$linenum = 0;
%lt=();
while ($linenum <= $#lines) {
@fields = split(/\t/, $lines[$linenum]);
if ($fields[0] =~ /^((v|V)[0-9]+(n|N)[0-9A-Za-z]+)/) {
$lt{uc($1)} = [@{$lt{uc($1)}}, $linenum];
}
$linenum++;
}
# Assemble each intro.
for $key (sort(keys(%lt))) {
open(fh,"./prod-map/map.$key") or next;
@fids=<fh>;
close(fh);
@fids = sort(@fids);
print "Generating intro for $key ...\n";
open(fh,">:encoding(UTF-8)", "./src/$key/intro");
# Create the HTML for each article.
for ($i = 0; $i <= $#{$lt{$key}}; $i++) {
@fields = split(/\t/, $lines[$lt{$key}[$i]]);
if (length($fields[10]) > 0) {
$fields[10] =~ s/["]{1}([^"])/$1/g;
$fields[10] =~ s/("|"")$//g;
print fh "<strong>$fields[10]</strong><br>\n";
}
# Create the list of authors.
$authcnt = 0;
if (length($fields[11]) > 0) {
$fields[11] =~ s/"//g;
@authors = split(/;/,$fields[11]);
foreach $author (@authors) {
$author =~ s/^\s+//;
if ($authcnt > 0) {
print fh "; $author";
} else {
print fh $author;
}
$authcnt++;
}
}
# Add page numbers.
if (length($fields[8]) > 0) {
print fh " pp. $fields[8]";
}
if (length($fields[9]) > 0) {
print fh "-$fields[9]";
}
print fh "<br>\n";
# Create links for each article.
@item_hid = split(/\s/,$fids[$i]);
$itemno = $item_hid[0];
$itemhid = $item_hid[1];
$fields[0] = trim($fields[0]);
$filename = "./src/$key/$itemno/".$fields[0].".pdf";
@st = stat($filename) or die "No $filename: $!";
$size = int($st[7]/1024);
$url_1 = "/dspace/handle/$itemhid";
$url_2 = "/dspace/bitstream/$itemhid/1/$fields[0]";
print fh '<a href="'.$url_1.'">Article description</a> | <a href="'.$url_2.'">Article Full Text PDF ('.$size.'KB)</a><br><br>';
print fh "\n";
}
close(fh);
}
-------------------------------------------------------------------------------------------------------------------------------
-- installintro.sh --
#!/bin/sh
# Install introductory text given a load-set directory and a collection handle.
DIR="$1"
HANDLE="$2"
URL="https://kb.osu.edu/dspace"
# Login to DSpace
curl -k -L -s $URL/password-login -d "login_email=[name removed]@osu.edu" -d "login_password=password" -c cookie.txt > /dev/null
# Cut the community_id out of the web page.
COMMUNITY_ID=`curl -k -L -s -b cookie.txt \
$URL/handle/$HANDLE \
| grep -m1 name=\"community_id\" \
| cut -d\" -f6`
# Cut the collection_id out of the web page.
COLLECTION_ID=`curl -k -L -s -b cookie.txt \
$URL/handle/$HANDLE \
| grep -m1 name=\"collection_id\" \
| cut -d\" -f6`
# Cut the title out of the web page.
TITLE=`curl -k -L -s -b cookie.txt \
$URL/tools/edit-communities \
-d "community_id=$COMMUNITY_ID" \
-d "collection_id=$COLLECTION_ID" \
-d "action=4" \
| grep -m1 name=\"name\" \
| cut -d\" -f6`
# Put the introductory text in DSpace.
curl -k -L -s -b cookie.txt \
$URL/tools/edit-communities \
-d "name=$TITLE" \
-d "short_description=" \
-d "introductory_text=`cat ./src/$DIR/intro`" \
-d "copyright_text=" \
-d "side_bar_text=" \
-d "license=" \
-d "provenance_description=" \
-d "community_id=$COMMUNITY_ID" \
-d "collection_id=$COLLECTION_ID" \
-d "create=false" \
-d "action=9" \
-d "submit=Update" > /dev/null
-------------------------------------------------------------------------------------------------------------------------------
-- ldallintro.pl --
#!/usr/bin/perl
# Load file of issues into an array.
open(fh,"loaditems");
@lines=<fh>;
close(fh);
$linenum = 0;
%lt=();
# Process each intro.
while ($linenum <= $#lines) {
@fields = split(/\t/, $lines[$linenum]);
print("$lines[$linenum]");
system("./installintro.sh $lines[$linenum] ");
$linenum++;
}
Appendix B. MSS Phase Two Scripts
-- mkxml2.pl --
#!/usr/bin/perl
# Load routines for UTF-16 and UTF-8
use Encode;
# Routine to clean up metadata fields
sub trim($)
{
my $string = shift;
$string =~ s/^\s+//;
$string =~ s/\s+$//;
$string =~ s/^"//;
$string =~ s/"$//;
return $string;
}
# Load metadata into an array.
open(fh,"<:encoding(UTF-16)", "MSA-phase-2-v3.txt");
@lines=<fh>;
close(fh);
$linenum = 0;
%lt=();
# Split tab separated metadata fields
while ($linenum <= $#lines) {
@fields = split(/\t/, $lines[$linenum]);
if ($fields[4] =~ /^([0-9]{4}-[^0-9]+[0-9]+)/) {
$lt{$1} = [@{$lt{$1}}, $linenum];
}
$linenum++;
}
$cnt1 = 0; $cnt2 = 0; $cnt3 = 0; $cnt4 = 0; $cnt5 = 0; $cnt6 = 0;
# Process metadata line by line
for $key (sort(keys(%lt))) {
$year = substr($key, 0, 4);
# Generate possible image file names.
$keyzero = substr($key,0,-1). "0" . substr($key, -1, 1);
$keyuc = uc($key);
$keyuczero = uc($keyzero);
# Compensate for inconsistent naming of images in metadata.
if (-e "../images/$year/$key.jpg") {
$filename = $key;
} elsif (-e "../images/$year/$keyzero.jpg") {
$filename = $keyzero;
} elsif (-e "../images/$year/$keyuc.jpg") {
$filename = $keyuc;
} elsif (-e "../images/$year/$keyuczero.jpg") {
$filename = $keyuczero;
} else {
$filename = "";
print " NO FILE FOUND images/$year/$key.jpg\n";
}
# Divide output into separate load sets based on year.
if (($year >= "1946") && ($year <= "1959")) {
$dir = sprintf("1/item_%04d", $cnt1++);
}
if (($year >= "1960") && ($year <= "1969")) {
$dir = sprintf("2/item_%04d", $cnt2++);
}
if (($year >= "1970") && ($year <= "1979")) {
$dir = sprintf("3/item_%04d", $cnt3++);
}
if (($year >= "1980") && ($year <= "1989")) {
$dir = sprintf("4/item_%04d", $cnt4++);
}
if (($year >= "1990") && ($year <= "1999")) {
$dir = sprintf("5/item_%04d", $cnt5++);
}
if (($year >= "2000") && ($year <= "2100")) {
$dir = sprintf("6/item_%04d", $cnt6++);
}
# Make a directory for the item.
print "mkdir $dir\n";
system "mkdir $dir";
# Create XML file from metadata
open(fh,">:encoding(UTF-8)", "$dir/dublin_core.xml");
print fh '<dublin_core>'."\n";
print fh '<dcvalue element="identifier" qualifier="none">'
.$key.'</dcvalue>'."\n";
print fh '<dcvalue element="type" qualifier="none">Article</dcvalue>'."\n";
print fh '<dcvalue element="language" qualifier="iso">en</dcvalue>'."\n";
$affiliation = '';
$affiliation1 = '';
$affiliation2 = '';
# Metadata for an item with multiple authors, each
# with an individual affiliation, spans multiple lines.
# Collect them and produce XML for them.
for $i (0 .. $#{$lt{$key}}) {
@fields = split(/\t/, $lines[$lt{$key}[$i]]);
$title = trim($fields[9]);
if (length($title) > 0) {
$title =~ s/["]{1}([^"])/$1/g;
$title =~ s/("|"")$//g;
print fh '<dcvalue element="title" qualifier="none">'
.$title.'</dcvalue>'."\n";
}
$year1 = trim($fields[1]);
if (length($year1) > 0) {
print fh '<dcvalue element="date" qualifier="issued">'
."$year</dcvalue>\n";
}
$author = trim($fields[5]);
if (length($author) > 0) {
$author =~ s/(\$|\^|\{|\}|\*)//g;
print fh '<dcvalue element="creator" qualifier="none">'
.$author.'</dcvalue>'."\n";
}
$abstract = trim($fields[10]);
if (length($abstract) > 0) {
print fh '<dcvalue element="description" qualifier="abstract">'
.$abstract.'</dcvalue>'."\n";
}
if (length(trim($fields[6])) > 0) {
$affiliation1 = trim($fields[6]);
}
if (length(trim($fields[7])) > 0) {
$affiliation2 = trim($fields[7]);
}
if ((length(trim($fields[6])) > 0)
|| (length(trim($fields[7])) > 0)) {
if ((length(trim($fields[6])) == 0)
&& (length($affiliation1) == 0)) {
$append = $affiliation2;
} elsif ((length(trim($fields[7])) == 0)
&& (length($affiliation2) == 0)) {
$append = $affiliation1;
} else {
$append = $affiliation1.", "
.$affiliation2;
}
if (length($affiliation) > 0) {
$affiliation = $affiliation.
"; ".$append;
} else {
$affiliation = $append;
}
}
$note = trim($fields[11]);
if (length($note) > 0) {
print fh '<dcvalue element="description" qualifier="none">'
.$note.'</dcvalue>'."\n";
}
} # Done processing multiple authors.
# Finish producing the XML for this item.
print fh '<dcvalue element="description" qualifier="none">Author Institution: '
.$affiliation.'</dcvalue>'."\n";
print fh '</dublin_core>'."\n";
close(fh);
# Create the 'contents' file.
open(fh, ">$dir/contents");
if ($filename ne "") {
print fh "$filename.jpg";
$cmd = "cp \"../images/$year/$filename.jpg\" $dir";
print $cmd."\n";
system $cmd;
}
close(fh);
} # Finished processing this item.
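mkxml2.pl writes six load sets, one per date range, into the directories 1 through 6, each item in its own item_NNNN subdirectory; import_collections.sh below then loads one of them (here ./5) into the matching collection. A sketch of one item (the image name is a placeholder):
./5/item_0000/dublin_core.xml
./5/item_0000/contents
./5/item_0000/<image-name>.jpg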
-------------------------------------------------------------------------------------------------------------------------------
-- import_collections.sh --
#!/bin/sh
#
# Import a collection from files generated on dspace
#
COLLECTION_ID=1811/6634
EPERSON="[name removed]@osu.edu"
SOURCE_DIR=./5
BASE_ID=`basename $COLLECTION_ID`
MAPFILE=./map.$BASE_ID
/dspace/bin/dsrun org.dspace.app.itemimport.ItemImport --add --eperson=$EPERSON --collection=$COLLECTION_ID --source=$SOURCE_DIR --mapfile=$MAPFILE
Appendix C. Example dublin_core.xml for MSS 2009
<dublin_core>
<dcvalue element="identifier" qualifier="none">2009-MJ-10</dcvalue>
<dcvalue element="title" qualifier="none">VIBRATIONAL OVERTONE SPECTRA OF $C_2H_6$ AND $C_2H_4$ IN CRYOGENIC LIQUIDS</dcvalue>
<dcvalue element="date" qualifier="issued">2009</dcvalue>
<dcvalue element="description" qualifier="abstract">Vibrational overtone spectra of $C_2H_6$ and $C_2H_4$ in cryogenic solutions were recorded between 5000 and 14000 cm$^{-1}$. Spectral regions for the first four overtones were measured using a Fourier transform spectrophotometer. The fifth overtone $(\Delta\nu=6)$ spectra between 15,000 and 16,000 cm$^{-1}$ were recorded with a double beam (pump-probe) thermal lens technique using concentrations as low as 10$^{-3}$ mole fraction. The peak frequency shift $(\Delta\omega)$ from gas phase to solution is explained by the change in harmonic frequency and anharmonicity in solution with respect to the gas phase values. The bandwidth $(\Delta\omega_{1/2})$ of the $(\Delta\nu= 6)$ C-H absorption bands in solution can be explained in terms of collisions with the solvent molecules.</dcvalue>
<dcvalue element="description" qualifier="none">Author Institution: Department of Chemistry and Biochemistry, Baylor University, Waco, Texas, 76798</dcvalue>
<dcvalue element="type" qualifier="none">Article</dcvalue>
<dcvalue element="language" qualifier="iso">en</dcvalue>
<dcvalue element="creator" qualifier="none">Diez-y-Riega, Maria H.</dcvalue>
<dcvalue element="creator" qualifier="none">Manzanares, Carlos E.</dcvalue>
</dublin_core>
<dcvalue element="identifier" qualifier="none">2009-MJ-10</dcvalue>
<dcvalue element="title" qualifier="none">VIBRATIONAL OVERTONE SPECTRA OF $C_2H_6$ AND $C_2H_4$ IN CRYOGENIC LIQUIDS</dcvalue>
<dcvalue element="date" qualifier="issued">2009</dcvalue>
<dcvalue element="description" qualifier="abstract">Vibrational overtone spectra of $C_2H_6$ and $C_2H_4$ in cryogenic solutions were recorded between 5000 and 14000 cm$^{-1}$. Spectral regions for the first four overtones were measured using a Fourier transform spectrophotometer. The fifth overtone $(\Delta\nu=6)$ spectra between 15,000 and 16,000 cm$^{-1}$ were recorded with a double beam (pump-probe) thermal lens technique using concentrations as low as 10$^{-3}$ mole fraction. The peak frequency shift $(\Delta\omega)$ from gas phase to solution is explained by the change in harmonic frequency and anharmonicity in solution with respect to the gas phase values. The bandwidth $(\Delta\omega_{1/2})$ of the $(\Delta\nu= 6)$ C-H absorption bands in solution can be explained in terms of collisions with the solvent molecules.</dcvalue>
<dcvalue element="description" qualifier="none">Author Institution: Department of Chemistry and Biochemistry, Baylor University, Waco, Texas, 76798</dcvalue>
<dcvalue element="type" qualifier="none">Article</dcvalue>
<dcvalue element="language" qualifier="iso">en</dcvalue>
<dcvalue element="creator" qualifier="none">Diez-y-Riega, Maria H.</dcvalue>
<dcvalue element="creator" qualifier="none">Manzanares, Carlos E.</dcvalue>
</dublin_core>
Appendix D. Section of MSS Author Quality Control Script
-- flipper.pl --
#!/usr/bin/perl
#### Sections omitted ####
#### Begin author correction block ####
$creatorxml = "";
if (length($creators) > 0) {
# Creator names are contaminated with comments.
# Remove the comments.
$creators =~ s/"//g;
$creators =~ s/\\thanks\{.+\}//;
$creators =~ s/\\thanks \{.+\}//;
$creators =~ s/\\footnote\{.+\}//;
# Multiple creators are separated by ';' or AND in the metadata.
@creatorlist = split(/;| and | AND /,$creators);
# Process each creator.
foreach $creator (@creatorlist) {
# Remove per name comments and punctuation.
$creator =~ s/^\s+//;
$creator =~ s/FULL NAME OF AUTHOR FROM OTHER LOCATION//;
$creator =~ s/\\underline \{(.+)\}/$1/;
$creator =~ s/\\address\{//;
$creator =~ s/\\//g;
$creator =~ s/\{//g;
$creator =~ s/\}//g;
$creator =~ s/\^//g;
$creator =~ s/\'//g;
$creator =~ s/\%//g;
$creator =~ s/^AND$|^and$//;
if (length($creator) > 0) {
$creator =~ s/\.(\w)/. $1/g;
# Split the name apart on spaces.
@nameparts = split(/ /,$creator);
# Process each part of the name.
for($i = 0;$i <= $#nameparts; $i++) {
# Adjust case.
$nameparts[$i] = lc($nameparts[$i]);
$nameparts[$i] = ucfirst($nameparts[$i]);
$c = rindex($nameparts[$i],"-");
# Uppercase hyphenated names.
if ($c != -1) {
$r = uc(substr($nameparts[$i],$c+1,1));
substr($nameparts[$i],$c+1,1,$r);
}
}
$lname = pop(@nameparts);
$nl = $nameparts[-1];
# Handle name prefixes.
if ($nl eq "Von"
|| $nl eq "Vander"
|| $nl eq "Le"
|| $nl eq "De"
|| $nl eq "de") {
$lname = pop(@nameparts)." ".$lname;
}
# Handle special case name parts
if ($nl eq "Der" ) {
$nl2 = $nameparts[-2];
$lname = pop(@nameparts)." ".$lname;
if ($nl2 eq "Van" ) {
$lname = pop(@nameparts)." ".$lname;
}
}
# assemble the name and make the XML.
$name = $lname .", ".join(" ",@nameparts);
$creatorxml .= '<dcvalue element="creator" qualifier="">'
.$name.'</dcvalue>'."\n ";
}
}
} # Done processing creators of this item.
#### End author correction block ####
#### Sections omitted ####
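As an illustration of the correction block above, two hypothetical creator strings and the XML the block would emit for them:
JOHN A. SMITH -> <dcvalue element="creator" qualifier="">Smith, John A.</dcvalue>
J. A. VAN DER WAALS -> <dcvalue element="creator" qualifier="">Van Der Waals, J. A.</dcvalue>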
Appendix E. MSS 2009 Batch Loading Scripts
-- mkxml2009.pl --
#!/usr/bin/perl
use Encode; # Routines for UTF encoding
use Text::xSV; # Routines to process CSV files.
use File::Basename;
# Open and read the comma separated metadata file.
my $csv = new Text::xSV;
#$csv->set_sep("\t"); # Uncomment for tab-separated files.
$csv->open_file("MSS2009.csv");
$csv->read_header(); # Process the CSV column headers.
# Constants for file and directory names.
$basedir = "/common/batch/input/mss/";
$indir = "$basedir/2009";
$xmldir= "./2009xml";
$imagesubdir= "processed_images";
$filename = "dublin_core.xml";
# Process each line of metadata, one line per item.
$linenum = 1;
while ($csv->get_row()) {
# This divides the item's metadata into fields, each in its own variable.
my (
$identifier,
$title,
$creators,
$description_abstract,
$issuedate,
$description,
$description2,
$abstract,
$gif,
$ppt,
) = $csv->extract(
"Talk_id",
"Title",
"Creators",
"Abstract",
"IssueDate",
"Description",
"AuthorInstitution",
"Image_file_name",
"Talk_gifs_file",
"Talk_ppt_file"
);
$creatorxml = "";
# Multiple creators are separated by ';' in the metadata.
if (length($creators) > 0) {
# Create XML for each creator.
@creatorlist = split(/;/,$creators);
foreach $creator (@creatorlist) {
if (length($creator) > 0) {
$creatorxml .= '<dcvalue element="creator" qualifier="none">'
.$creator.'</dcvalue>'."\n ";
}
}
} # Done processing creators for this item.
# Create the XML string for the Abstract.
$abstractxml = "";
if (length($description_abstract) > 0) {
# Convert special metadata characters for use in xml/html.
$description_abstract =~ s/\&/&amp;/g;
$description_abstract =~ s/\>/&gt;/g;
$description_abstract =~ s/\</&lt;/g;
# Build the Abstract in XML.
$abstractxml = '<dcvalue element="description" qualifier="abstract">'
.$description_abstract.'</dcvalue>';
}
# Create the XML string for the Description.
$descriptionxml = "";
if (length($description) > 0) {
# Convert special metadata characters for use in xml/html.
$description=~ s/\&/&amp;/g;
$description=~ s/\>/&gt;/g;
$description=~ s/\</&lt;/g;
# Build the Description in XML.
$descriptionxml = '<dcvalue element="description" qualifier="none">'
.$description.'</dcvalue>';
}
# Create the XML string for the Author Institution.
$description2xml = "";
if (length($description2) > 0) {
# Convert special metadata characters for use in xml/html.
$description2=~ s/\&/&amp;/g;
$description2=~ s/\>/&gt;/g;
$description2=~ s/\</&lt;/g;
# Build the Author Institution XML.
$description2xml = '<dcvalue element="description" qualifier="none">'
.'Author Institution: ' .$description2.'</dcvalue>';
}
# Convert special characters in title.
$title=~ s/\&/&amp;/g;
$title=~ s/\>/&gt;/g;
$title=~ s/\</&lt;/g;
# Create XML File
$subdir = $xmldir."/".$linenum;
system "mkdir $basedir/$subdir";
open(fh,">:encoding(UTF-8)", "$basedir/$subdir/$filename");
print fh <<"XML";
<dublin_core>
<dcvalue element="identifier" qualifier="none">$identifier</dcvalue>
<dcvalue element="title" qualifier="none">$title</dcvalue>
<dcvalue element="date" qualifier="issued">$issuedate</dcvalue>
$abstractxml
$descriptionxml
$description2xml
<dcvalue element="type" qualifier="none">Article</dcvalue>
<dcvalue element="language" qualifier="iso">en</dcvalue>
$creatorxml
</dublin_core>
XML
close(fh);
# Create contents file and move files to the load set.
# Copy item files into the load set.
if (defined($abstract) && length($abstract) > 0) {
system "cp $indir/$abstract $basedir/$subdir";
}
$sourcedir = substr($abstract, 0, 5);
if (defined($ppt) && length($ppt) > 0 ) {
system "cp $indir/$sourcedir/$sourcedir/*.* $basedir/$subdir/";
}
if (defined($gif) && length($gif) > 0 ) {
system "cp $indir/$sourcedir/$imagesubdir/*.* $basedir/$subdir/";
}
# Make the 'contents' file and fill it with the file names.
system "touch $basedir/$subdir/contents";
if (defined($gif) && length($gif) > 0
&& -d "$indir/$sourcedir/$imagesubdir" ) {
# Sort items in reverse order so they show up right in DSpace.
# This is a hack that depends on how the DB returns items
# in unsorted (physical) order. There are better ways to do this.
system "cd $indir/$sourcedir/$imagesubdir/;"
. " ls *[0-9][0-9].* | sort -r >> $basedir/$subdir/contents";
system "cd $indir/$sourcedir/$imagesubdir/;"
. " ls *[a-zA-Z][0-9].* | sort -r >> $basedir/$subdir/contents";
}
if (defined($ppt) && length($ppt) > 0
&& -d "$indir/$sourcedir/$sourcedir" ) {
system "cd $indir/$sourcedir/$sourcedir/;"
. " ls *.* >> $basedir/$subdir/contents";
}
# Put the Abstract in last, so it displays first.
system "cd $basedir/$subdir; basename $abstract >>"
. " $basedir/$subdir/contents";
$linenum++;
} # Done processing an item.
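For each spreadsheet row the script leaves a numbered load-set directory under the batch input area, ready for import.sh below; the copied abstract, image, and presentation files vary with the source data:
/common/batch/input/mss/2009xml/1/dublin_core.xml
/common/batch/input/mss/2009xml/1/contents
/common/batch/input/mss/2009xml/1/(copied abstract, GIF, and PPT files)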
-------------------------------------------------------------------------------------------------------------------------------
-- import.sh --
#!/bin/sh
#
# Import a collection from files generated on dspace
#
COLLECTION_ID=1811/6635
EPERSON=[name removed]@osu.edu
SOURCE_DIR=./2009xml
BASE_ID=`basename $COLLECTION_ID`
MAPFILE=./map-dspace03-mss2009.$BASE_ID
/dspace/bin/dsrun org.dspace.app.itemimport.ItemImport --add --eperson=$EPERSON --collection=$COLLECTION_ID --source=$SOURCE_DIR --mapfile=$MAPFILE