MEL install and database mapping
From HISCOM
This page describes one approach to the process of installing a BioCase provider and mirroring a texpress database to use as the data provider. In summary, a single mysql table is used to represent the texpress data, while a number of abcd tables are created. Then there is a big SQL script that runs from a cron script that does the updating from the single table to the abcd tables.
Contents |
[edit]
MySQL mirror
The Texpress database is mirrored in the following single table mysql database:
CREATE TABLE specimens (
additcol VARCHAR(90) NULL, # additional collectors
altm INT NULL, # altitude in metres (ft not needed)
authorit VARCHAR(80) NULL, # the species authority
basio2 VARCHAR(66) NULL, # basionym 2
basio3 VARCHAR(66) NULL, # basionym 3
basionym VARCHAR(66) NULL, # oldest basionym
bru VARCHAR(30) NULL, # TDWG level 4
carpolco CHAR(2) NULL, # carpological collection
ciba CHAR(2) NULL, # ciobachrome
class CHAR(2) NULL, # Y if a dry sheet
collcode CHAR(3) NULL, # internal use for unknown collectors
colldat_1 INT NULL, # collection date day
colldat_2 CHAR(3) NULL, # colleciton month
colldat_3 INT NULL, # collection year
collectr VARCHAR(65) NULL, # collector
colourtr CHAR(2) NULL, # colour transparency
confdat_1 INT NULL, # day of confirmation
confdat_2 CHAR(3) NULL, # month of confirmation
confdat_3 INT NULL, # year of confirmation
confirme VARCHAR(30) NULL, # who confirmed specimen
country VARCHAR(30) NULL, # TDWG level 2
cultivat CHAR(2) NULL, # Cultivated specimen
datecrea DATE NULL, # date record created
dateedit DATE NULL, # date record last edited
declat DOUBLE NULL, # decimal lat created on fly
declong DOUBLE NULL, # decimal long created on fly
depth INT NULL,
detby VARCHAR(30) NULL, # determiner
detdat_1 INT NULL, # determined on day DD
detdat_2 CHAR(3) NULL, # determined on month MMM
detdat_3 INT NULL, # determined on YEAR YYYY
duplicat VARCHAR(135) NULL, # destination of replicates
duptypst VARCHAR(15) NULL, # type status of dupliates
exsiccat VARCHAR(25) NULL, # who donated specimen
extra CHAR(2) NULL, # there is extra info on the specimen
family VARCHAR(35) NULL,
genus VARCHAR(25) NULL,
geonum TINYINT NULL, # TDWG Level 1 number
georeg VARCHAR(20) NULL, # TDWG level 1
gridrefe_1 CHAR(1) NULL, # major grid
gridrefe_2 TINYINT NULL, # minor grid
gridrefe_3 CHAR(1) NULL, # major grid 2
gridrefe_4 TINYINT NULL, # minor grid 2
habit TEXT NULL,
habitat TEXT NULL,
hcommonn VARCHAR(60) NULL, # host common name
HERBKEY_1 VARCHAR(10) NOT NULL, # PRIMARY KEY
HERBKEY_2 INT NOT NULL, # PRIMARY KEY
HERBKEY_3 CHAR(1) NOT NULL, # PRIMARY KEY
hgenus VARCHAR(30) NULL, # host genus
hinfrasp VARCHAR(35) NULL, # host infraspecific name
hortref CHAR(2) NULL, # part of hort ref collection
hostfami VARCHAR(35) NULL, # host family
hspecies VARCHAR(40) NULL, # host species
illustra CHAR(2) NULL, # an illustration with specimen
infrank VARCHAR(5) NULL, # infraspecific rank
infraspn VARCHAR(50) NULL, # infraspecific name
infraspq VARCHAR(10) NULL, # infraspecific qualifier
infrauth VARCHAR(58) NULL, # infraspecific authority
intro CHAR(2) NULL, # if species is introduced
lat1_1 SMALLINT NULL, # lat degrees
lat1_2 TINYINT NULL, # lat minutes
lat1_3 TINYINT NULL, # lat seconds
lat1_4 CHAR(1) NULL, # lat dir
letter CHAR(2) NULL, # the jar size
lichenla CHAR(2) NULL, # if barcodes have been printed
locality TEXT NULL,
long1_1 SMALLINT NULL, # long degrees
long1_2 TINYINT NULL, # long minutes
long1_3 TINYINT NULL, # long seconds
long1_4 CHAR(1) NULL, # long direction
melfamil SMALLINT NULL, # the internal numbering of families
mixednot VARCHAR(150) NULL, # notes to distinguish parts of a mixed coll.
multiple VARCHAR(150) NULL, # notes on how this might relate to another rec
native CHAR(2) NULL, # is a native species
neg CHAR(2) NULL, # a photographic negative with specimen
newloan_1 SMALLINT NULL, # the year component of the loan number
newloan_2 SMALLINT NULL, # the number component of the loan number
notes TEXT NULL,
number VARCHAR(12) NULL, # the collectors field number
origherb CHAR(5) NULL, # which herb has the specimen for this image
phenolog_1 CHAR(2) NULL, # FL if flowers
phenolog_2 CHAR(2) NULL, # FR if fruits
phenolog_3 CHAR(2) NULL, # BD if buds
phenolog_4 CHAR(2) NULL, # LL if leafless
phenolog_5 CHAR(2) NULL, # FE if fertile
phenolog_6 CHAR(2) NULL, # ST if sterile
photo CHAR(2) NULL, # a photo with specimen
precisio TINYINT NULL, # the precision of the geocode
priornam VARCHAR(135) NULL, # equiv to name comments
prot3 VARCHAR(64) NULL, # protologue 3
proto2 VARCHAR(64) NULL, # protologue 2
protolog VARCHAR(100) NULL, # the oldest protologue
seedling VARCHAR(12) NULL, # a cross refereced seedling number
slide CHAR(2) NULL, # if there is a slide
slideno VARCHAR(10) NULL, # the slide number
source TINYINT NULL, # source of collection
species VARCHAR(50) NULL,
spirit CHAR(2) NULL, # if there is a spirit collection
spiritco VARCHAR(10) NULL, # the spirit number
spqual VARCHAR(10) NULL, # species qualifier
state VARCHAR(28) NULL, # TDWG level 3
substrat VARCHAR(75) NULL, # substrate
supragrp CHAR(2) NULL, # supra group F,D,M,A,B,H etc
synonym VARCHAR(67) NULL, # no longer used
transferred DATETIME NULL, # date transferred
typedby VARCHAR(8) NULL, # who created record
typeof VARCHAR(67) NULL, # no longer used
typest3 VARCHAR(38) NULL, # type status 3
typesta2 VARCHAR(38) NULL, # type status 2
typestat VARCHAR(20) NULL, # oldest type status
typey CHAR(2) NULL, # if this is a type
userid VARCHAR(8) NULL, # who last edited record
voucher CHAR(2) NULL, # is a voucher for something
INDEX (genus, species, infraspn),
INDEX (family),
INDEX (newloan_1, newloan_2),
INDEX (supragrp),
INDEX (detby),
INDEX (confirme),
INDEX (collectr, number),
INDEX (colldat_3, colldat_2,colldat_1),
INDEX (georeg),
INDEX (country),
INDEX (state),
INDEX (bru),
INDEX (declat, declong),
INDEX (locality(200)),
INDEX (habitat(200)),
INDEX (habit(200)),
INDEX (notes(200)),
INDEX (phenolog_1),
INDEX (phenolog_2),
INDEX (phenolog_3),
INDEX (phenolog_4),
INDEX (phenolog_5),
INDEX (phenolog_6),
INDEX (typedby),
INDEX (userid),
INDEX (datecrea),
INDEX (dateedit),
PRIMARY KEY (HERBKEY_1, HERBKEY_2, HERBKEY_3)
);
[edit]
ABCD tables
The following tables were created to better reflect the structure of ABCD
[edit]
abcd_area
CREATE TABLE abcd_area ( area_name varchar(150) default NULL, area_class varchar(30) default NULL, unitID varchar(10) default NULL, areaID int(11) NOT NULL auto_increment, PRIMARY KEY (areaID), KEY unitID (unitID) )
[edit]
abcd_biotope
CREATE TABLE abcd_biotope ( unitID varchar(10) default NULL, parameter varchar(30) default NULL, value text, is_quantitative tinyint(4) default NULL, biotopeID int(11) NOT NULL auto_increment, PRIMARY KEY (biotopeID), KEY unitID (unitID) )
[edit]
abcd_collector
CREATE TABLE abcd_collector ( collector_name varchar(255) default NULL, sequence int(11) default NULL, primary_flag int(11) default NULL, unitID varchar(10) default NULL, collectorID int(11) NOT NULL auto_increment, PRIMARY KEY (collectorID), KEY unitID (unitID) )
[edit]
abcd_higher_taxon
CREATE TABLE abcd_higher_taxon ( htID int(11) NOT NULL auto_increment, name varchar(80) default NULL, rank varchar(20) default NULL, unitID varchar(10) default NULL, PRIMARY KEY (htID), KEY unitID (unitID) )
[edit]
abcd_identification
CREATE TABLE abcd_identification ( unitID varchar(10) default NULL, higher_taxon_name varchar(30) default NULL, higher_taxon_rank varchar(20) default NULL, scientific_name varchar(100) default NULL, genus varchar(30) default NULL, name_addendum varchar(50) default NULL, species varchar(30) default NULL, author_team varchar(30) default NULL, author_team_parenthesis varchar(30) default NULL, rank varchar(10) default NULL, infraspecies varchar(30) default NULL, verification_level varchar(20) default NULL, identifiers_text varchar(40) default NULL, identification_date date default NULL, idenfifiers_role varchar(30) default NULL, identification_qualifier varchar(20) default NULL, identification_qualifier_insert int(11) default NULL, name_comments varchar(200) default NULL, KEY unitID (unitID) )
[edit]
abcd_metadata
CREATE TABLE abcd_metadata ( metaID int(11) NOT NULL default '0', dataset_title varchar(100) default NULL, technical_contact_name varchar(100) default NULL, date_modified date default NULL, owner varchar(100) default NULL, source_id varchar(10) default NULL, PRIMARY KEY (metaID), KEY source_id (source_id) )
[edit]
abcd_typification
CREATE TABLE abcd_typification ( unitID varchar(10) default NULL, type_status varchar(20) default NULL, basionym varchar(100) default NULL, KEY unitID (unitID) )
[edit]
abcd_unit
CREATE TABLE abcd_unit (
unitID varchar(10) NOT NULL default ,
herbarium_source varchar(6) NOT NULL default ,
scientific_name varchar(255) default NULL,
collector_number varchar(15) default NULL,
collection_date date default NULL,
generalised_locality varchar(30) default NULL,
declat_fudged float default NULL,
declong_fudged float default NULL,
geocode_source enum('collector','compiler','generalised arbitrary point','automatically generated') default NULL,
geocode_precision_fudged int(11) default NULL,
genus varchar(30) default NULL,
species varchar(30) default NULL,
rank varchar(15) default NULL,
infraspecies varchar(30) default NULL,
date_last_edited date default NULL,
family varchar(30) default NULL,
declat double default NULL,
declong double default NULL,
geocode_precision int(11) default NULL,
locality text,
alt int(11) default NULL,
depth int(11) default NULL,
coordinates_text varchar(100) default NULL,
spatial_datum varchar(10) default NULL,
notes text,
country varchar(40) default NULL,
duplicates varchar(153) default NULL,
donor varchar(25) default NULL,
cultivated char(2) default NULL,
PRIMARY KEY (unitID),
KEY scientific_name_index (scientific_name),
KEY latlong (declat_fudged,declong_fudged)
)
[edit]
The conversion script
This SQL script gets run to convert the data in the specimen table into the various abcd tables (note, this is still a work in progress, so not all data elements are being created just yet).
-- in this script, the data will be set by using sed to change
-- the <date> value
SET @datefrom = <date>;
-- load the unit table
REPLACE LOW_PRIORITY INTO abcd_unit (
unitID,
herbarium_source,
scientific_name,
genus,
species,
rank,
infraspecies,
family,
collector_number,
collection_date,
generalised_locality,
declat_fudged,
declong_fudged,
geocode_precision_fudged,
declat,
declong,
geocode_precision,
geocode_source,
date_last_edited,
locality,
alt,
depth,
coordinates_text,
spatial_datum,
notes,
country,
duplicates,
donor,
cultivated)
SELECT
-- unitID
CONCAT(HERBKEY_2,HERBKEY_3),
-- herbarium_source
"MEL",
-- scientific_name
CONCAT_WS(" ", genus, species, infrank, infraspn),
-- genus
genus,
-- species
species,
-- rank
CASE infrank WHEN "ssp" THEN "subsp."
WHEN "ssp." THEN "subsp."
WHEN "subsp" THEN "subsp."
WHEN "f" THEN "f."
WHEN "forma" THEN "f."
WHEN "var" THEN "var."
ELSE NULL
END
,
-- infraspecies
infraspn,
--family
family,
-- collectors_number
number,
-- collection_date
CONCAT_WS("-",
colldat_3,
CASE colldat_2 WHEN "Jan" THEN 01
WHEN "Feb" THEN 02
WHEN "Mar" THEN 03
WHEN "Apr" THEN 04
WHEN "May" THEN 05
WHEN "Jun" THEN 06
WHEN "Jul" THEN 07
WHEN "Aug" THEN 08
WHEN "Sep" THEN 09
WHEN "Oct" THEN 10
WHEN "Nov" THEN 11
WHEN "Dec" THEN 12
ELSE NULL
END,
colldat_1),
-- generalised_locality
"Nearest locality not available",
-- declat_fudged
CASE lat1_4 WHEN "S" THEN -1.00 WHEN "N" THEN 1.00 END *
(lat1_1 * 1.00)
+ ((FLOOR(lat1_2/10.0) * 10.0)/60.00),
-- declong_fudged
CASE long1_4 WHEN "W" THEN -1.00 WHEN "E" THEN 1.00 END *
(long1_1 * 1.00)
+ ((FLOOR(long1_2/10.0) * 10.0)/60.00),
-- geocode_precision_fudged
-- (note adding in extra for fudging of geocode)
CASE precisio WHEN 1 THEN 50
WHEN 2 THEN 1000
WHEN 3 THEN 10000
WHEN 4 THEN 250000
WHEN 5 THEN 110000
ELSE NULL
END
+ (18500),
CASE lat1_4 WHEN "S" THEN -1.00
WHEN "N" THEN 1.00 END
* (lat1_1 * 1.000)
+ (lat1_2/60.00)
+ (lat1_3/3600.0),
-- declong
CASE long1_4 WHEN "W" THEN -1.00 WHEN "E" THEN 1.00 END
* (long1_1 * 1.000)
+ (long1_2/60.00)
+ (long1_3/3600.0),
-- geocode_precision
CASE precisio WHEN 1 THEN 50
WHEN 2 THEN 1000
WHEN 3 THEN 10000
WHEN 4 THEN 250000
WHEN 5 THEN 110000
ELSE NULL END ,
-- geocode_source
CASE source WHEN 1 THEN "collector"
WHEN 2 THEN "compiler"
WHEN 3 THEN "compiler"
WHEN 4 THEN "collector"
ELSE NULL
END,
-- date_last_edited
dateedit,
-- locality
locality,
-- alt
altm,
-- depth
depth,
-- coordinates_text
IF(lat1_1 OR long1_1,CONCAT_WS(",",lat1_1, lat1_2, lat1_3, lat1_4, long1_1, long1_2, long1_3, long1_4),NULL),
-- spatial_datum
"gda94",
-- notes
notes,
-- country
country,
-- duplicates
duplicat,
-- donor
exsiccat,
-- cultivated
IF(cultivat = "Y", "G", "W")
FROM specimens WHERE dateedit >= @datefrom;
-- now load the abcd_collector table
-- but first delete records that relate to the data we are loading
DELETE LOW_PRIORITY from abcd_collector USING abcd_collector, abcd_unit
WHERE abcd_unit.unitID = abcd_collector.unitID AND abcd_unit.date_last_edited >= @datefrom;
REPLACE LOW_PRIORITY INTO abcd_collector (
collector_name,
sequence,
primary_flag,
unitID)
SELECT
collectr, -- collector_name
1, -- sequence
1, -- primary collector flag
CONCAT(HERBKEY_2,HERBKEY_3)-- unitID
FROM specimens WHERE dateedit >= @datefrom;
-- delete old abcd_area data
DELETE LOW_PRIORITY FROM abcd_area USING abcd_area, abcd_unit
WHERE abcd_unit.unitID = abcd_area.unitID AND abcd_unit.date_last_edited >= @datefrom;
-- load in state
REPLACE LOW_PRIORITY INTO abcd_area (
area_name,
area_class,
unitID)
SELECT
bru,
"State",
CONCAT(HERBKEY_2,HERBKEY_3)
FROM specimens
WHERE bru IS NOT NULL AND dateedit >= @datefrom;
-- load in country
REPLACE LOW_PRIORITY INTO abcd_area (
area_name,
area_class,
unitID)
SELECT
country,
"Country",
CONCAT(HERBKEY_2,HERBKEY_3)
FROM specimens
WHERE bru IS NOT NULL AND dateedit >= @datefrom;
-- delete LOW_PRIORITY from biotope
DELETE FROM abcd_biotope USING abcd_biotope, abcd_unit
WHERE abcd_unit.unitID = abcd_biotope.unitID AND abcd_unit.date_last_edited >= @datefrom;
-- load into biotope
INSERT LOW_PRIORITY INTO abcd_biotope (unitID,parameter,value,is_quantitative)
SELECT CONCAT(HERBKEY_2,HERBKEY_3),"habitat",habitat,0
FROM specimens where habitat IS NOT NULL AND dateedit >= @datefrom;
INSERT LOW_PRIORITY INTO abcd_biotope (unitID,parameter,value,is_quantitative)
SELECT CONCAT(HERBKEY_2,HERBKEY_3),"form",habit,0
FROM specimens where habit IS NOT NULL AND dateedit >= @datefrom;
-- load in district (botanical region / district)
REPLACE LOW_PRIORITY INTO abcd_area (
area_name,
area_class,
unitID)
SELECT
regiondi,
"sru",
CONCAT(HERBKEY_2,HERBKEY_3)
FROM specimens WHERE regiondi IS NOT NULL AND dateedit >= @datefrom;
-- load into higher_taxon
DELETE LOW_PRIORITY from abcd_higher_taxon USING abcd_higher_taxon, abcd_unit
WHERE abcd_unit.unitID = abcd_higher_taxon.unitID AND abcd_unit.date_last_edited >= @datefrom;
REPLACE LOW_PRIORITY INTO abcd_higher_taxon(
name, rank, unitID)
SELECT
family, "family", CONCAT(HERBKEY_2,HERBKEY_3)
FROM specimens WHERE family IS NOT NULL AND dateedit >= @datefrom;
-- update the date last updated in the metadata
update abcd_metadata SET date_modified = NOW() WHERE source_id = "MEL";
