From 7b5518cbf6af971396c877a2f9fa9cc962283acc Mon Sep 17 00:00:00 2001 From: Sophie Yang <78716301+yangsophieee@users.noreply.github.com> Date: Thu, 14 Dec 2023 14:42:54 +1100 Subject: [PATCH] Add GitHub chapter (#17) * add github chapter * add versioned_releases chapter --------- Co-authored-by: ehwenk --- _quarto.yml | 8 ++-- check_dataset_functions.qmd | 4 +- database_structure.qmd | 14 +++---- github.qmd | 61 +++++++++++++++++++++++++++ traits_build.qmd | 2 +- tutorial_dataset_1.qmd | 64 ++++++++++++++--------------- tutorial_dataset_2.qmd | 40 +++++++++--------- tutorial_dataset_3.qmd | 6 +-- tutorial_dataset_4.qmd | 82 ++++++++++++++++++------------------- tutorial_dataset_5.qmd | 12 +++--- tutorial_dataset_6.qmd | 4 +- tutorial_dataset_7.qmd | 14 +++---- versioned_releases.qmd | 39 ++++++++++++++++++ 13 files changed, 226 insertions(+), 124 deletions(-) create mode 100644 github.qmd create mode 100644 versioned_releases.qmd diff --git a/_quarto.yml b/_quarto.yml index 9b2cbe4..73764c3 100644 --- a/_quarto.yml +++ b/_quarto.yml @@ -4,7 +4,7 @@ project: book: title: "The {traits.build} data standard, R package, and workflow" author: ["Elizabeth Wenk", "Daniel Falster", "Sophie Yang", "Fonti Kar"] - page-footer: "How to get help:
Copyright 2023, Daniel Falster and Elizabeth Wenk" + page-footer: "How to get help:
Copyright 2023, Daniel Falster and Elizabeth Wenk" page-navigation: true chapters: - part: Introduction @@ -13,7 +13,7 @@ book: - motivation.qmd - workflow.qmd - usage_examples.qmd - - part: Data structure and standard + - part: Data structure and standard chapters: - long_wide.qmd - database_standard.qmd @@ -41,6 +41,8 @@ book: - adding_data_long.qmd - check_dataset_functions.qmd - data_common_issues.qmd + - github.qmd + - versioned_releases.qmd - part: Using outputs of `traits.build` chapters: - austraits_database.qmd @@ -56,7 +58,7 @@ book: appendices: - csv.qmd - yaml.qmd - + format: html: theme: cosmo diff --git a/check_dataset_functions.qmd b/check_dataset_functions.qmd index ee9bb48..875a2f0 100644 --- a/check_dataset_functions.qmd +++ b/check_dataset_functions.qmd @@ -161,7 +161,7 @@ dataset_check_outlier_by_species <- function(database, dataset, trait, multiplie comparisons) %>% dplyr::filter(as.numeric(value) > multiplier*mean_value | as.numeric(value) < (1/multiplier)*mean_value) %>% dplyr::mutate(value_ratio = as.numeric(value)/mean_value) %>% - dplyr::arrange(value_ratio) + dplyr::arrange(dplyr::desc(value_ratio)) need_review @@ -213,7 +213,7 @@ dataset_check_outlier_by_genus <- function(database, dataset, trait, multiplier) comparisons) %>% dplyr::filter(as.numeric(value) > multiplier*mean_value | as.numeric(value) < (1/multiplier)*mean_value) %>% dplyr::mutate(value_ratio = as.numeric(value)/mean_value) %>% - dplyr::arrange(value_ratio) + dplyr::arrange(dplyr::desc(value_ratio)) need_review diff --git a/database_structure.qmd b/database_structure.qmd index 9016a76..a6f731b 100644 --- a/database_structure.qmd +++ b/database_structure.qmd @@ -241,7 +241,7 @@ elements <- schema$austraits$elements$excluded_data writeLines(c("")) ``` -## taxa +## Taxa **Description:** A table containing details on taxa that are included in the table [`traits`](#traits). We have attempted to align species names with known taxonomic units in the [`Australian Plant Census` (APC)](https://biodiversity.org.au/nsl/services/apc) and/or the [`Australian Plant Names Index` (APNI)](https://biodiversity.org.au/nsl/services/APNI); the sourced information is released under a CC-BY3 license. @@ -256,7 +256,7 @@ elements <- schema$austraits$elements$taxa writeLines(c("")) ``` -## taxonomic_updates +## Taxonomic_updates ```{r} elements <- schema$austraits$elements$taxonomic_updates @@ -273,7 +273,7 @@ elements <- schema$austraits$elements$taxonomic_updates Both the original and the updated taxon names are included in the [`traits`](#traits) table. -## definitions +## Definitions ```{r} elements <- schema$austraits$elements$definitions @@ -300,7 +300,7 @@ for (trait in c("leaf_mass_per_area", "woodiness")) { } ``` -## contributors +## Contributors ```{r} elements <- schema$austraits$elements$contributors @@ -315,7 +315,7 @@ elements <- schema$austraits$elements$contributors writeLines(c("")) ``` -## sources +## Sources For each dataset in the compilation there is the option to list primary and secondary citations. The primary citation is defined as, `r austraits$schema$metadata$elements$source$values$primary$description` The secondary citation is defined as, `r austraits$schema$metadata$elements$source$values$secondary$description` @@ -334,7 +334,7 @@ austraits$sources["Falster_2005_1"] A formatted version of the sources also exists within the table [methods](#methods). 
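To pull the formatted citation for a dataset programmatically, a minimal sketch — assuming, as in recent AusTraits releases, that the `methods` table stores the formatted citations in a `source_primary_citation` column:

```{r, eval=FALSE}
library(dplyr)

# Look up the formatted primary citation for a single dataset
austraits$methods %>%
  filter(dataset_id == "Falster_2005_1") %>%
  distinct(dataset_id, source_primary_citation)
```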
-## metadata +## Metadata ```{r} elements <- schema$austraits$elements$metadata @@ -346,7 +346,7 @@ elements <- schema$austraits$elements$metadata writeLines(c("")) ``` -## build_info +## Build_info ```{r} elements <- schema$austraits$elements$build_info diff --git a/github.qmd b/github.qmd new file mode 100644 index 0000000..4e8a398 --- /dev/null +++ b/github.qmd @@ -0,0 +1,61 @@ +# Using GitHub + +## Working with your GitHub repository + +For {traits.build} users, the preferred way of hosting your database is on GitHub. + +### Setting up the repository + +There are some GitHub settings we recommend: +- `General`: Enable "Always suggest updating pull request branches" to keep the branch up to date with the main branch before merging +- `General`: Enable "Automatically delete head branches" to delete the branch after merging, which keeps your branches clean +- `Branches`: Add a branch protection rule for your main or develop branch and enable "Require a pull request before merging", "Require conversation resolution before merging", "Require deployments to succeed before merging" + +#### Automated tests during pull requests + +To run automated tests that must pass before a pull request can be merged in, you can set up GitHub workflows via the Actions tab on GitHub. The setting "Require deployments to succeed before merging" must be enabled for the `main` or `develop` branch. You can write your own workflows which are stored in `.github/workflows/`. For {austraits.build}, the GitHub workflow runs `dataset_test` on all data sources and compiles the database (see [here](https://github.com/traitecoevo/austraits.build/blob/51964dbe4d302c6dade51db133e9e32514cddaae/.github/workflows/check-build.yml)). + + +### Adding to the repository + +New data can be added to the repository by creating a branch and then opening a [pull request](https://help.github.com/articles/using-pull-requests/) (PR). Those who want to contribute but aren't approved maintainers of the database, must fork and clone the database from GitHub. + +In short, + +1. Create a Git branch for your new work, either within the repo (if you are an approved contributor) or as a [fork of the repo](https://help.github.com/en/github/getting-started-with-github/fork-a-repo). +2. Make commits and push these up onto the branch. +3. Make sure everything runs fine before you send a PR (see [tutorials for adding datasets](tutorial_datasets.html)). +4. Submit the PR and tag someone as a reviewer. +5. Squash and merge the PR once approved and any changes have been made. + +**Tip**: For working with git and GitHub, we recommend GitHub Desktop, a user-friendly graphical interface tool. + +#### Merging a pull request + +The easiest way to merge a PR is to use GitHub's built-in options for squashing and merging. This leads to: + +- A single commit +- The work is attributed to the original author + +You can merge in a PR after it has been approved. To merge a PR, you need to be an approved maintainer. You do not need to be the original author of the PR (the commit will still be by the original author). + +1. Send the PR. +2. Tag someone to review. +3. If there are any updates to the main branch, merge those into your new branch and resolve any conflicts. +4. Once ready, merge into the main branch, choosing "Squash & Merge", using an informative commit message. "Squash" merges all your commits on the branch into one. + +##### Commit messages + +Informative commit messages are ideal. 
They should clearly describe the work done and the value added to the database in a few clear, bulleted points. If relevant, they should reference any GitHub issues. You can [link to and directly close GitHub issues via the commit message](https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue#linking-a-pull-request-to-an-issue-using-a-keyword). To link to another commit you can also use the SHA-hash or its 7-character prefix.

An example commit message:

```
Smith_1996: Add study
- For #224, closes #286
- Trait data for Nothofagus forests across Australia, New Zealand and South America
```

## Bugs and feature requests for {traits.build}

If you find a bug or have a feature request for {traits.build}, [file a GitHub issue](https://github.com/traitecoevo/traits.build/issues). Illustrate the bug with a minimal [reprex](https://www.tidyverse.org/help/#reprex) (reproducible example). Please feel free to contribute by implementing the fix or feature via pull request. For substantial pull requests, it is best to first check with the {traits.build} team that the problem is worth pursuing.

diff --git a/traits_build.qmd b/traits_build.qmd
index d72bd2f..4331538 100644
--- a/traits_build.qmd
+++ b/traits_build.qmd
@@ -8,7 +8,7 @@ The core components of the `{traits.build}` package are:

1. 15 functions, supplemented by a detailed [protocol](tutorial_datasets.html), to wrangle diverse datasets into input files with a common structure that captures both the trait data and all essential metadata and context properties. These are a table (data.csv) containing all trait data, taxon names, location names (if relevant), and any context properties (if relevant), and a structured metadata file (metadata.yml) that assigns the columns from the `data.csv` file to their specific variables and maps all additional dataset metadata in a structured format.

-2. An R-based pipeline to combine the input files into a single harmonised database with aligned trait names, aligned units, aligned categorical trait values, and aligned taxon names. Four dataset-specific configuration files are required for the build process, 1) a trait dictionary; 2) a units conversion file; 3) a taxon list; and 4) a database metadata file.
+2. An R-based pipeline to combine the input files into a single harmonised database with aligned trait names, aligned units, aligned categorical trait values, and aligned taxon names. Four database-specific configuration files are required for the build process: 1) a trait dictionary; 2) a units conversion file; 3) a taxon list; and 4) a database metadata file.

Guided by the information in the configuration files, the R-scripted workflow combines the `data.csv` and `metadata.yml` files for the individual datasets into a unified, harmonised database. There are three distinct steps to this process, handled by a trio of functions: `dataset_configure`, `dataset_process`, and `dataset_taxonomic_updates`. These functions cyclically build each dataset, only combining them into a single database at the end of the workflow.

diff --git a/tutorial_dataset_1.qmd b/tutorial_dataset_1.qmd
index c1dbe31..0ab8ef5 100644
--- a/tutorial_dataset_1.qmd
+++ b/tutorial_dataset_1.qmd
@@ -12,7 +12,7 @@ Before you begin this tutorial, ensure you have installed traits.build, cloned t

- Learn how to [merge a new dataset](#build_pipeline) into a `traits.build` database.
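If you have not yet completed the setup mentioned above (installing traits.build and cloning the template repository), a minimal sketch — the repository paths below are the public `traitecoevo` ones and assume the {remotes} package is available:

```{r, eval=FALSE}
# Install the {traits.build} package from GitHub
# install.packages("remotes")
remotes::install_github("traitecoevo/traits.build")

# The tutorial datasets live in the template repository, cloned with:
# git clone https://github.com/traitecoevo/traits.build-template
```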
-### New Functions Introduced +### New functions introduced - metadata_create_template @@ -40,7 +40,7 @@ In the traits.build-template repository, there is a folder titled `tutorial_data - There is a folder `raw` nested within the `tutorial_dataset_1` folder, that contains one file, `notes.txt`.  -### source necessary functions +### Source necessary functions - Source the functions in the `traits.build` package: @@ -140,7 +140,7 @@ A follow-up question then allows you to add a fixed `collection_date` as a range [Enter collection_date range in format '2007/2009':]{style="color:blue;"} [**2002-11/2002-11**]{style="color:red;"}\ -A final user prompt asks if, for any traits, a sequence of rows represents repeat observations.\ +A final user prompt asks if, for any traits, a sequence of rows represents repeat observations.\ [Do all traits need `repeat_measurements_id`'s?]{style="color:blue;"} @@ -168,7 +168,7 @@ metadata_add_source_doi(dataset_id = "tutorial_dataset_1", doi = "10.1111/j.0022 The following information is automatically propagated into the source field: ```{r, eval=FALSE} -primary: +primary: key: Test_1 bibtype: Article year: '2005' @@ -286,30 +286,30 @@ You select columns 3, 4, 5, as these contain trait data. ```{r, eval=FALSE} traits: -- var_in: LMA (mg mm-2) - unit_in: unknown - trait_name: unknown - entity_type: unknown - value_type: unknown - basis_of_value: unknown - replicates: unknown - methods: unknown -- var_in: Leaf nitrogen (mg mg-1) - unit_in: unknown - trait_name: unknown - entity_type: unknown - value_type: unknown - basis_of_value: unknown - replicates: unknown - methods: unknown -- var_in: leaf size (mm2) - unit_in: unknown - trait_name: unknown - entity_type: unknown - value_type: unknown - basis_of_value: unknown - replicates: unknown - methods: unknown +- var_in: LMA (mg mm-2) + unit_in: unknown + trait_name: unknown + entity_type: unknown + value_type: unknown + basis_of_value: unknown + replicates: unknown + methods: unknown +- var_in: Leaf nitrogen (mg mg-1) + unit_in: unknown + trait_name: unknown + entity_type: unknown + value_type: unknown + basis_of_value: unknown + replicates: unknown + methods: unknown +- var_in: leaf size (mm2) + unit_in: unknown + trait_name: unknown + entity_type: unknown + value_type: unknown + basis_of_value: unknown + replicates: unknown + methods: unknown ``` ------------------------------------------------------------------------ @@ -401,7 +401,7 @@ If the units being read in for a specific trait differ from those defined for th #### **Final steps** -##### **double check the metadata.yml file** +##### **Double check the metadata.yml file** You should now have a completed `metadata.yml` file, with no `unknown` fields. @@ -409,7 +409,7 @@ You'll notice five sections we haven't used, `contexts`, `substitutions`, `taxon These should each contain an `.na` (as in `substitutions: .na`). They will be explored in future lessons. 
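For reference, a sketch of how these unused sections appear in the finished file — section names follow the metadata template generated by `metadata_create_template`, so treat this listing as indicative rather than exhaustive:

```{r, eval=FALSE}
contexts: .na
substitutions: .na
taxonomic_updates: .na
exclude_observations: .na
questions: .na
```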
-##### **run tests on the metadata file** +##### **Run tests on the metadata file** Confirm there are no errors in the `metadata.yml` file: @@ -421,7 +421,7 @@ This *should* result in the following output: [\[ FAIL 0 \| WARN 0 \| SKIP 0 \| PASS 79 \]]{style="color:blue;"}\ -##### **add dataset to the database** {#build_pipline} +##### **Add dataset to the database** {#build_pipeline} Next add the dataset_id to the build file that builds the database and rebuild the database @@ -430,7 +430,7 @@ build_setup_pipeline(method = "base", database_name = "traits.build_database") source("build.R") ``` -##### **build dataset report** +##### **Build dataset report** As a final step, build a report for the study diff --git a/tutorial_dataset_2.qmd b/tutorial_dataset_2.qmd index d59773a..1c38ad1 100644 --- a/tutorial_dataset_2.qmd +++ b/tutorial_dataset_2.qmd @@ -18,7 +18,7 @@ Before you begin this tutorial, ensure you have installed traits.build, cloned t - Understand the importance of having the [dataset pivot](#dataset_pivot). -### New Functions Introduced +### New functions introduced - metadata_add_substitution @@ -125,14 +125,14 @@ Then rename your columns to match those in use: locations <- locations %>% rename( - `longitude (deg)` = long, - `latitude (deg)` = lat, - `description` = vegetation, - `elevation (m)` = elevation, - `precipitation, MAP (mm)` = MAP, - `soil P, total (mg/kg)` = `soil P`, - `soil N, total (ppm)` = `soil N`, - `geology (parent material)` = `parent material` + `longitude (deg)` = long, + `latitude (deg)` = lat, + `description` = vegetation, + `elevation (m)` = elevation, + `precipitation, MAP (mm)` = MAP, + `soil P, total (mg/kg)` = `soil P`, + `soil N, total (ppm)` = `soil N`, + `geology (parent material)` = `parent material` ) ``` @@ -407,7 +407,7 @@ custom_R_code: ' mutate( across(c("TRAIT Leaf Dry Mass UNITS g"), ~na_if(.x,0)) ) %>% - group_by(name_original) %>% + group_by(name_original) %>% mutate( across(c("TRAIT Growth Form CATEGORICAL EP epiphyte (mistletoe) F fern G grass H herb S shrub T tree V vine"), replace_duplicates_with_NA) ) %>% @@ -427,23 +427,23 @@ Then rebuild the database and look at the output in the traits table for one of source("build.R") traits.build_database$traits %>% - filter(dataset_id == "tutorial_dataset_2") %>% + filter(dataset_id == "tutorial_dataset_2") %>% filter(taxon_name == "Actinotus minor") %>% View() dataset_id taxon_name observation_id trait_name value unit entity_type location_id - -1 tutorial_dataset_2 Actinotus minor 010 leaf_area 18.8 mm2 population 02 -2 tutorial_dataset_2 Actinotus minor 010 leaf_dry_mass 7 mg population 02 -3 tutorial_dataset_2 Actinotus minor 010 leaf_mass_per_area 344.827586206897 g/m2 population 02 -4 tutorial_dataset_2 Actinotus minor 011 leaf_area 75.9 mm2 population 03 -5 tutorial_dataset_2 Actinotus minor 011 leaf_dry_mass 7 mg population 03 -6 tutorial_dataset_2 Actinotus minor 011 leaf_mass_per_area 89.2857142857143 g/m2 population 03 -7 tutorial_dataset_2 Actinotus minor 012 plant_growth_form herb NA species NA + +1 tutorial_dataset_2 Actinotus minor 010 leaf_area 18.8 mm2 population 02 +2 tutorial_dataset_2 Actinotus minor 010 leaf_dry_mass 7 mg population 02 +3 tutorial_dataset_2 Actinotus minor 010 leaf_mass_per_area 344.827586206897 g/m2 population 02 +4 tutorial_dataset_2 Actinotus minor 011 leaf_area 75.9 mm2 population 03 +5 tutorial_dataset_2 Actinotus minor 011 leaf_dry_mass 7 mg population 03 +6 tutorial_dataset_2 Actinotus minor 011 leaf_mass_per_area 89.2857142857143 g/m2 population 03 
+7 tutorial_dataset_2 Actinotus minor 012 plant_growth_form herb NA species NA
```

The measurements for the three numeric traits from a single location share a common `observation_id`, as they are all part of an observation of a common entity (a specific population of *Actinotus minor*), at a single location, at a single point in time. However, the row with the plant growth form measurement has a separate `observation_id`, reflecting that this is an observation of a different entity (the taxon *Actinotus minor*).

-##### **build dataset report**
+##### **Build dataset report**

As a final step, build a report for the study

diff --git a/tutorial_dataset_3.qmd b/tutorial_dataset_3.qmd
index 79b6a2c..ae1ca9f 100644
--- a/tutorial_dataset_3.qmd
+++ b/tutorial_dataset_3.qmd
@@ -14,7 +14,7 @@ Before you begin this tutorial, ensure you have installed traits.build, cloned t

- Learn additional [custom_R_code tricks](#custom_R_code).

-### New Functions Introduced
+### New functions introduced

- metadata_add_contexts

@@ -303,7 +303,7 @@ Looking at the excluded_data table indicates there is a "\*" in one column, so o

```{r, eval=FALSE}
custom_R_code: '
  data %>%
    mutate(
      across(c("WP leaf (Mpa) predawn"), ~na_if(.x,"*"))
    )
'
```

@@ -317,7 +317,7 @@ This indicates a mismatch between column types, necessitating that you change th

```{r, eval=FALSE}
custom_R_code: '
  data %>%
    mutate(
      across(c("WP leaf (Mpa) predawn"), ~as.character(.x)),
      across(c("WP leaf (Mpa) predawn"), ~na_if(.x,"*"))
    )
'
```

diff --git a/tutorial_dataset_4.qmd b/tutorial_dataset_4.qmd
index 3b5f878..4e3dea2 100644
--- a/tutorial_dataset_4.qmd
+++ b/tutorial_dataset_4.qmd
@@ -19,7 +19,7 @@ Before you begin this tutorial, ensure you have installed traits.build, cloned t

- Learn how to [exclude data](#exclude_data).

-### New Functions Introduced
+### New functions introduced

- none.

This dataset is a subset of data from Togashi_2015 in AusTraits. The data are a compilation of many datasets, each with its own reference information.\

For such datasets, there are two options: 1) sourcing the data from the original publication, and adding it to the database as its own dataset; 2) entering the dataset as part of a broader compilation. For this study, in AusTraits, a combination of both approaches was used. For original sources that included a broader range of traits and similar (or better) resolution, the original source was used.
However, in this compilation there were several studies where Togashi had already individually contacted the authors of the source publications, as the data appendix for this publication often had better resolution than the original papers.\

This tutorial focuses on adding a single dataset derived from many original studies.\

Before you begin creating the metadata file, take a look at the data.csv file - you'll notice that the columns present are quite different from the datasets added so far. There are columns for latitude & longitude, but no location name column. There are columns for the original source. There are also columns for entity_type, basis_of_record, and value_type, three metadata fields that, for the previous tutorial datasets, were entered in the metadata file as a fixed value. And the only trait column, `log_LA.SA`, is in a non-standard format.

With a few small tricks this dataset can also be added seamlessly.

@@ -58,11 +58,11 @@ source("R/custom_R_code.R")

### Create the metadata.yml file

- Note that, because this dataset has a number of variables that cannot simply be added using the `metadata_add_...` functions, unlike in previous tutorials you'll now mix the three steps that were previously separated:\

    1. Use `metadata_add...` functions where possible.\
    2. Mutate new columns using `custom_R_code`.\
    3. Manually map newly created column names in the appropriate part of the `metadata.yml` file.\

#### **Create a metadata template**

@@ -106,19 +106,19 @@ However, you also want to acknowledge the original data sources, those documente

```{r, eval=FALSE}
read_csv("data/tutorial_dataset_4/data.csv") %>% distinct(References)

# A tibble: 11 × 1
   References
   <chr>
 1 Barrett et al. 1996
 2 Benyon et al. 1999
 3 Bleby et al.2009
 4 Brodribb and Felid 2000
 5 Brooksbank et al. 2011
 6 Canham et al. 2009
 7 Carter and White 2009
 8 Cernusak et al. 2006
 9 Choat et al. 2005
10 Drake and Franks 2003
11 Drake et al. 2011
```

You would now need to look up each of these references in the reference section of the manuscript and use Google Scholar (or another reference resource) to look up the doi for each reference.\

To add additional references, you need to add an argument to the `metadata_add_source_doi` function:\

@@ -137,13 +137,13 @@ Note that AusTraits will build fine without adding all 11 original sources; it i

#### **Map source_id into metadata file** {#source_ID}

- `source_id` is a metadata field that is used relatively infrequently. In the AusTraits trait database only about 1 in 20 studies requires the mapping of a `source_id`, and therefore as a default the field does not get added to the metadata template.\

- Instead, you have to manually add it to the file's `dataset` section, generally directly below `location_name`.\

- Simply add a line `source_id: source_id`. Make sure the indents line up with the fields above/below.\

- When the AusTraits team initially added this dataset, the curator manually added the `source_id` column. Otherwise such a column could be mutated from the reference column using `custom_R_code` or added manually in Excel.\

------------------------------------------------------------------------

@@ -155,7 +155,7 @@ Although you could edit the data.csv file directly (and sometimes we do), you co

```{r, eval=FALSE}
custom_R_code: '
    data %>%
      mutate(
        location_name = paste0("lat_",Lat,"_long_",Long)
      )
  '
```

In this case you're using `custom_R_code` to generate many unique location names as a column in the data table as it is first read into the R workflow.

*Note: While for this example you are generating many unique location names, there are many datasets where all data have been collected at a single location, and therefore the submitted dataset doesn't include a `location_name` column.
For all of those you simply add code like `mutate(location_name = "Broken Hill")` into `custom_R_code`.*\ You next want to create a table of location names and location properties (i.e latitude & longitude):\ ```{r, eval=FALSE} -location_table <- +location_table <- metadata_check_custom_R_code("tutorial_dataset_4") %>% select(location_name, Lat, Long) %>% rename(`latitude (deg)` = Lat, `longitude (deg)` = Long) %>% @@ -178,9 +178,9 @@ location_table <- metadata_add_locations("tutorial_dataset_4", location_table) ``` -Notes:\ +Notes:\ -- Remember the function `metadata_check_custom_R_code` reads in the data.csv file, applies custom_R_code manipulations, and outputs the updated data table. This is very useful if you want to check your `custom_R_code` is performing as expected or if you want to perform further manipulations to the output.\ +- Remember the function `metadata_check_custom_R_code` reads in the data.csv file, applies custom_R_code manipulations, and outputs the updated data table. This is very useful if you want to check your `custom_R_code` is performing as expected or if you want to perform further manipulations to the output.\ - There are many other ways to create a table of location names and properties. You could create a standalone table using R (or Excel), but this solution generates no additional files to store.\ @@ -201,7 +201,7 @@ There is a single trait for this study, Huber value, which is the sapwood area t ```{r, eval=FALSE} custom_R_code: ' - data %>% + data %>% mutate( location_name = paste0("lat_",Lat,"_long_",Long), LA.SA = 10^(log_LA.SA) @@ -233,13 +233,13 @@ As with previous datasets, the following section has been added to `metadata.yml This trait has several "non-standard" values:\ -- **unit_in**: The units for the input column are leaf area/sapwood area, a dimensionless "area ratio". Meanwhile, Huber value is reported as sapwood area/leaf area, the inverse dimensionless "area ratio".\ +- **unit_in**: The units for the input column are leaf area/sapwood area, a dimensionless "area ratio". Meanwhile, Huber value is reported as sapwood area/leaf area, the inverse dimensionless "area ratio".\ + + The UCUM standard to which `traits.build` conforms specifies that "dimensionless" is only accepted for the very few traits that are truly dimensionless, not traits where units top and bottom simply cancel out. You need to specify that it is a ratio of area/area (or mass/mass, count/count, etc.)\ - The UCUM standard to which `traits.build` conforms specifies that "dimensionless" is only accepted for the very few traits that are truly dimensionless, not traits where units top and bottom simply cancel out. You need to specify that it is a ratio of area/area (or mass/mass, count/count, etc.)\ + Looking in the trait dictionary, you'll see that the units are specified as: `mm2{sapwood}/mm2{leaf}`, specifically to be explicit about which area is the denominator vs numerator. You therefore specify that the `units_in` are `mm2{leaf}/mm2{sapwood}`.\ - Looking in the trait dictionary, you'll see that the units are specified as: `mm2{sapwood}/mm2{leaf}`, specifically to be explicit about which area is the denominator vs numerator. 
You therefore specify that the `units_in` are `mm2{leaf}/mm2{sapwood}`.\ - - Since it is dimensionless, you could, of course specify any area units on top and bottom as long as they are identical, but I know `mm2{leaf}/mm2{sapwood}` is already in the unit conversions file.\ + Since it is dimensionless, you could, of course specify any area units on top and bottom as long as they are identical, but I know `mm2{leaf}/mm2{sapwood}` is already in the unit conversions file.\ - **entity_type**, **value_type**: Because this study is a compilation of many sources, the entity_type and value_type are not consistent across all measurements. Instead the data curator had to go back to many of the original sources and document which were population-level versus individual-level measurements, and correspondingly which were means vs raw values. For such circumstances (which also can occur within a single study), you can map a column name as the value.\ @@ -289,13 +289,13 @@ You are then led sequentially through the user prompts for each of the context p [4: method_context]{style="color:red;"}\ -This context is a method context, because it specifies a difference in methodology that might influence that trait value.\ +This context is a method context, because it specifies a difference in methodology that might influence that trait value.\ [The following values exist for this context: trunk sample branch sample]{style="color:blue;"}\ [Are replacement values required? (y/n)]{style="color:blue;"} [n]{style="color:red;"}\ [Are descriptions required? (y/n)]{style="color:blue;"} [y]{style="color:red;"}\ -The answers to the next two questions are up to the dataset curator, but at AusTraits we decided that `trunk sample` and `branch sample` were sufficiently explicit context property values, but that it would be helpful to add a description.\ +The answers to the next two questions are up to the dataset curator, but at AusTraits we decided that `trunk sample` and `branch sample` were sufficiently explicit context property values, but that it would be helpful to add a description.\ Therefore, we filled in `context_property: wood sample type` and added descriptions for the context property values:\ @@ -324,7 +324,7 @@ Again, the dataset curator may choose what information to document within the me Filling in that the `context property` is `tree height (m)`, the metadata file would simply be:\ -```{r, eval=FALSE} +```{r, eval=FALSE} - context_property: tree height (m) category: entity_context var_in: Height @@ -352,7 +352,7 @@ dataset_report("tutorial_dataset_4", traits.build_database, overwrite = TRUE) Overall, this report isn't very informative since it is the first Huber value dataset in the new database.\ -But let me draw your attention to the list of taxa at the bottom. Because the tutorials are, for now, ignoring taxon alignments (`traits.build` in a state of flux with regard to this), the tutorials have ignored this section.\ +But let me draw your attention to the list of taxa at the bottom. Because the tutorials are, for now, ignoring taxon alignments (`traits.build` in a state of flux with regard to this), the tutorials have ignored this section.\ But let me draw your attention to the list of taxa at the bottom. Because the tutorials are, for now, ignoring taxon alignments (`traits.build` in a state of flux with regard to this), the tutorials have ignored this section.  However, note the unknown taxa names `unk sp. 1` and `unk sp. 2`. 
Although the AusTraits database accepts names resolved to genus and family, data collected on a truly unknown taxon is useless and should be excluded. Being a terrestrial vascular plant database, we also exclude mosses and lichens that are sometimes in datasets. The curators for other databases aligned to the `traits.build` workflow will have their own standards for values to explicitly disallow, based on taxonomy (or some other variable). @@ -368,13 +368,13 @@ exclude_observations: .na Change this to: ```{r, eval=FALSE} -exclude_observations: +exclude_observations: - variable: taxon_name find: unk sp. 1, unk sp. 2 reason: omitting completely unknown taxa (E Wenk, 2023.09.20) ``` -Notes:\ +Notes:\ - Other variable names can also be used here. Perhaps there is a particular context value or location that is known to have problematic data. In AusTraits this field is almost exclusively used to exclude specific taxa, but the metadata section is designed to have broader applications. diff --git a/tutorial_dataset_5.qmd b/tutorial_dataset_5.qmd index 436cf0a..7ef974a 100644 --- a/tutorial_dataset_5.qmd +++ b/tutorial_dataset_5.qmd @@ -14,7 +14,7 @@ It is also recommended that you first work through some of the earlier tutorials - Learn how to add [measurement remarks](#measurement_remarks)  -### New Functions Introduced +### New functions introduced - none. @@ -94,7 +94,7 @@ For instance, in this dataset, the column `Mother` documents the maternal lineag ```{r, eval=FALSE} custom_R_code: ' - data %>% + data %>% mutate( measurement_remarks = paste0("maternal lineage ", Mother) ) @@ -109,7 +109,7 @@ There isn't a location name specified in the data.csv file, so use `custom_R_cod ```{r, eval=FALSE} custom_R_code: ' - data %>% + data %>% mutate( measurement_remarks = paste0("maternal lineage ", Mother), location = "Australian National University glasshouse" @@ -259,7 +259,7 @@ Since the trait metadata is read in long after the `custom_R_code` code is execu #### **Adding contributors** -The file `data/tutorial_dataset_5/raw/tutorial_dataset_5_notes.txt` indicates the main data_contributor for this study.\ +The file `data/tutorial_dataset_5/raw/tutorial_dataset_5_notes.txt` indicates the main data_contributor for this study.\ #### **Dataset fields** @@ -279,9 +279,9 @@ source("build.R") traits.build_database$excluded_data %>% filter(dataset_id == "tutorial_dataset_5") %>% View() ``` -There should be no errors.\ +There should be no errors.\ -There are a handful of excluded values, including both negative photosynthetic rates and negative conductance rates and two instances where `leaf_area = 0`. The `leaf_area = 0` values need to be removed using `custom_R_code`.\ +There are a handful of excluded values, including both negative photosynthetic rates and negative conductance rates and two instances where `leaf_area = 0`. The `leaf_area = 0` values need to be removed using `custom_R_code`.\ ```{r, eval=FALSE} mutate(across(c("area_mm2"), ~na_if(.x,0))) diff --git a/tutorial_dataset_6.qmd b/tutorial_dataset_6.qmd index 41c735f..9943baa 100644 --- a/tutorial_dataset_6.qmd +++ b/tutorial_dataset_6.qmd @@ -14,7 +14,7 @@ It is also recommended that you first work through some of the earlier tutorials - Learn how to add [individual_id's](#individual_id)  -### New Functions Introduced +### New functions introduced - none. 
@@ -92,7 +92,7 @@ In order for `repeat_measurements_id`'s to properly generate, it is essential to ```{r, eval=FALSE} custom_R_code: ' - data %>% + data %>% mutate( individual_id = paste(Site, Species, `Leaf number`, sep = "_") ) diff --git a/tutorial_dataset_7.qmd b/tutorial_dataset_7.qmd index 4177217..f589efd 100644 --- a/tutorial_dataset_7.qmd +++ b/tutorial_dataset_7.qmd @@ -14,7 +14,7 @@ It is also recommended that you first work through some of the earlier tutorials - Learn how to add [units from a column](#units_column)  -### New Functions Introduced +### New functions introduced - none. @@ -119,13 +119,13 @@ Then fill in the details for each trait column in the traits section of the meta | seed length minimum | seed_length | units | species | minimum | measurement | .na | -Notes: +Notes: - You may have noticed in the data.csv file that there is also a column `units`. For many long datasets there is a fixed unit for each trait, just as is standardly the case for wide datasets. In such cases fixed units values are mapped into the traits section of the metadata file, just as occurs with most wide datasets. In this dataset there is a column documenting the units, as different tax have leaf length and seed length reported in different units. The column for units can be mapped in at the trait level, as indicated here, or, for a long dataset, it could be mapped in a single time in the dataset section of the metadata, `units_in: units` and then you'd delete the line referring to `units_in` from each of the traits. -- There are two different trait names that refer to seed length, `seed length maximum` and `seed length minimum`. It is not a problem that these both map to the trait concept `seed_length` as they are different value types.\ +- There are two different trait names that refer to seed length, `seed length maximum` and `seed length minimum`. It is not a problem that these both map to the trait concept `seed_length` as they are different value types.\ -- Because these are species-level trait values, even the numeric traits do not have a replicate count. The range of values should represent all individuals of the species.\ +- Because these are species-level trait values, even the numeric traits do not have a replicate count. The range of values should represent all individuals of the species.\ #### **Adding contributors** @@ -133,7 +133,7 @@ The file `data/tutorial_dataset_7/raw/tutorial_dataset_7_notes.txt` indicates th #### **Dataset fields** -The file `data/tutorial_dataset_7/raw/tutorial_dataset_7_notes.txt` indicates how to fill in the `unknown` dataset fields for this study.\ +The file `data/tutorial_dataset_7/raw/tutorial_dataset_7_notes.txt` indicates how to fill in the `unknown` dataset fields for this study.\ ### Testing, error fixes, and report building {#exclude_data} @@ -149,13 +149,13 @@ source("build.R") traits.build_database$excluded_data %>% filter(dataset_id == "tutorial_dataset_7") %>% View() ``` -The excluded data includes four rows of data with the error `Unsupported trait value` for the trait `leaf_compoundness`. The term `article` does not describe a leaf's compoundness. As articles are always `simple` leaves you can add a substitution:\ +The excluded data includes four rows of data with the error `Unsupported trait value` for the trait `leaf_compoundness`. The term `article` does not describe a leaf's compoundness. 
As articles are always `simple` leaves, you can add a substitution:\

```{r, eval=FALSE}
metadata_add_substitution(dataset_id = "tutorial_dataset_7", trait_name = "leaf_compoundness", find = "articles", replace = "simple")
```

Then rebuild the database and again check the excluded data to ensure the substitution has worked as intended.\

```{r, eval=FALSE}
traits.build_database$build_info$version <- "5.0.0" # a fix because the function was built around specific AusTraits versions
```

diff --git a/versioned_releases.qmd b/versioned_releases.qmd
new file mode 100644
index 0000000..e87aeaa
--- /dev/null
+++ b/versioned_releases.qmd
@@ -0,0 +1,39 @@
# Version updating & making a new release

Releases of your database are snapshots that are archived and available for use.

We suggest semantic versioning to label your versions. As discussed in [Falster et al. 2019](http://doi.org/10.1093/gigascience/giz035), semantic versioning can apply to datasets as well as code.

The version number has 3 components for actual releases, and 4 for development versions. The structure is `major.minor.patch.dev`, where `dev` is at least 9000. The `dev` component provides a visual signal that this is a development version. So, if the current version is 0.9.1.9000, the release will be 0.9.2, 0.10.0, or 1.0.0.

Our approach to incrementing version numbers is:

- `major`: Increment when you make changes to the structure that are likely incompatible with any code written to work with previous versions.
- `minor`: Increment to communicate any changes to the structure that are likely to be compatible with any code written to work with the previous versions (i.e., allows code to run without error). Such changes might involve adding new data within the existing structure, so that the previous database version exists as a subset of the new version. For tabular data, this includes adding columns or rows. On the other hand, removing data should constitute a major version, because records previously relied on may no longer exist.
- `patch`: Increment to communicate correction of errors in the actual data, without any changes to the structure. Such changes are unlikely to break or change analyses written with the previous version in a substantial way.

**Figure:** Semantic versioning communicates to users the types of changes that have occurred between successive versions of an evolving dataset, using a tri-digit label where increments in a number indicate major, minor, and patch-level changes, respectively. From [Falster et al. 2019](http://doi.org/10.1093/gigascience/giz035) (CC-BY).

The process of making a release is as follows. Note that corresponding releases and versions are needed in both `austraits` and `traits.build`: #TODO - this is no longer true right?

1. Update the version number in the DESCRIPTION file, using:

```{r, eval=FALSE}
desc::desc_bump_version() # Specify `which` argument for type of version increment
```

2. Compile the database.

3. Update documentation.

4. Commit and push to GitHub (using the PR workflow).

5. Make a release on GitHub, adding the version number.

6. Prepare for the next version by updating the version number.

```{r, eval=FALSE}
desc::desc_bump_version("dev")
```
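To make the cycle concrete, a minimal sketch of a patch release, assuming a starting version of 0.9.1.9000 and the default bumping behaviour of the {desc} package used above:

```{r, eval=FALSE}
# 1. Bump for release: 0.9.1.9000 -> 0.9.2 (the dev component is dropped)
desc::desc_bump_version("patch")

# 2. Compile the database, update documentation, then commit, push,
#    and make the v0.9.2 release on GitHub

# 3. Reopen development: 0.9.2 -> 0.9.2.9000
desc::desc_bump_version("dev")
```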