# Script by Carmen Segura to reproduce the main results and Figures 3 and 4 in: 

C. Segura, A.L. Neal, L. Castro-Sardiňa, P. Harris, M.J. Rivero, L.M. Cardenas, J.G.N. Irisarri,
Comparison of direct and indirect soil organic carbon prediction at farm field scale,
Journal of Environmental Management,
Volume 365,
2024,
121573,
ISSN 0301-4797,
https://doi.org/10.1016/j.jenvman.2024.121573.


# load packages

library(dplyr)
library(ggplot2)
library(patchwork)
library(Hmisc)
library(glmulti)    
library(lme4)
library(lmerTest)
library(MuMIn)
library(mgcv)
library(ggResidpanel)
library(relaimpo)
library(sjPlot)
library(flextable)
library(ggeffects)
library(gratia) 

#### 1. Read data ####

# setup your directory 

setwd("yourdirectory")

# read data .txt

ESPI.SOC<-read.table("data_source_Segura.txt", header=TRUE, stringsAsFactors=TRUE, sep="\t", na.strings="NA", dec=".", strip.white=TRUE)

# IMPORTANT: we have named some variables in the manuscript different from the .txt and from this code.

# variable in code and txt = variable in the manuscript
# MEAN.TC = SOC
# ESPI.MAR.OCT = ESPI
# TEMP.MAR.OCT = TEMP
# PPT.MAR.OCT = PPT
# plough.manag = management
# mow.early = mow.spring
# mow.late = mow.summer

#### 2. Prepare data ####

# Our data: new data frame when ESPI count proportion is > 0.8 and yr > 2011 (before 2012 we do not have soil data)

ESPI<-filter(ESPI.SOC,count.Prop.MAR.OCT.> 0.80 & yr >2011) 

# check structure

str(ESPI)

ESPI$yr<-as.factor(ESPI$yr)
ESPI$yrs.without<-as.numeric(ESPI$yrs.without )
ESPI$mow.late <-as.numeric(ESPI$mow.late)
ESPI$mow.early <-as.numeric(ESPI$mow.early)

# NAs

ESPI[ESPI==""]<-NA


#### 3. Modelling using a set of open- and non-open-source variables (Supplementary material section C) ####

# Explanatory variables: ESPI and all variables available from the NWFP related to land use and management, landscape, soils and climate

#### 3.1. Lineal model ####

# glmulti to fit best models, example with the definitive set of variables (see Supplementary material section C)

lm.A_all<-glmulti(MEAN.TC~ESPI.MAR.OCT+
                    yr +
                    plough.manag +
                    grazing +
                    veg.cover+
                    fert+
                    mow.early+
                    mow.late+
                    aspect.mean+
                    slope.mean+
                    soil.units+
                    TN+
                    pH+
                    TEMP.MAR.OCT+
                    PPT.MAR.OCT,
                    data=ESPI,
                    crit   = aicc,      
                    level  = 1,          
                    method = "h",        
                    family = gaussian, 
                    fitfunction = lm, 
                    confsetsize = 100)   


plot(lm.A_all)
print(lm.A_all)
plot(lm.A_all, type = "s")

# Information Criteria and the Akaike weights of our best models

weightable(lm.A_all)[1:6,] %>% 
  regulartable() %>%       
  autofit()



# Best linear model after checking all the models selected by glmulti (the same processess aply over the modelling) #

final.lm.all<-lm(MEAN.TC ~ plough.manag + soil.units + mow.late + TN + TEMP.MAR.OCT, data=ESPI)

summary(final.lm.all)
summary.aov(final.lm.all)
aicc(final.lm.all)

#diagnostic plots 

resid_panel(final.lm.all, plots = c("resid","qq","hist","lev"),type = NA, bins = 30,
            smoother = FALSE, qqline = TRUE, qqbands = FALSE, scale = 1,
            theme = "bw", axis.text.size = 12, title.text.size = 12,
            title.opt = TRUE, nrow = NULL)

# relative importance of factors

calc.relimp(final.lm.all, type = c("lmg"), rela = TRUE)

# Further exploration to detect non linear relationships

set_theme(base = theme_bw())  #to set ggplot theme in plot_model sjPlot
plot_model(final.lm.all, type = "resid") # Slope of coefficients for each single predictor against the residuals
plot_model(final.lm.all, type = "slope") # Slope of coefficients for each single predictor against the response

# model validation

#get predicted and observed values

pred_lm.all <- as.data.frame(predict.lm(final.lm.all, ESPI, se.fit=TRUE, type="response")) #fitted values
pred_lm.all$obs <- ESPI$MEAN.TC #observed values
pred_lm.all$manag <- ESPI$plough.manag #to plot observed values grouped by management

pred_lm.all%>%
  na.omit()

pred_lm.all<-filter(pred_lm.all,!obs=="na")

str(pred_lm.all)

# predicted vs observed and r

with(pred_lm.all,cor(obs, fit, method = c("pearson")))

# plot measured vs predicted grouped by management

lm.pred.vs.obs.manag <- ggplot(pred_lm.all, aes(x=obs, y=fit, color=manag)) +
  geom_point(shape=20, size = 3, stroke = 1) +
  geom_abline(intercept=0, slope=1) +
  ylab("SOC predicted")+
  scale_y_continuous(limits = c(2, 7)) +
  scale_x_continuous(limits = c(2, 7))+
  scale_color_manual(values = c("#E69F00","#AA4499","#44AA99"),labels = c("arable", "improved pasture", "permanent pasture"))+
  geom_smooth(method=lm , color="blue", fill="#69b3a2", se=TRUE) +
  theme_bw()+
  theme(text = element_text(size=12),
        legend.title = element_blank(),
        legend.position = c(.15, .91),
        legend.key = element_blank(),
        legend.text = element_text(size = 12),
        legend.background = element_rect(fill = "transparent")) +
  geom_segment(aes(x = 6.3, y = 2, xend = 6.7, yend = 2),colour="black") +
  annotate(geom="text", x=6.95, y=2.03, label="1:1", size=6)

lm.pred.vs.obs.manag

# RMSE:  to quantify prediction error, root mean squared error is the most commonly used measure of fit to test data

# function

rmse2 <- function(x, y, na.rm = TRUE){
  res <- sqrt(mean((x-y)^2, na.rm = na.rm))
  return(res)
}

# call to function               

rmse2(x = pred_lm.all$obs, y = pred_lm.all$fit)  # RMSE = 0.16 % SOC 

#to report % RMSE

mean.obser<-mean(pred_lm.all$obs)
mean.obser

rmse<-(0.1633583*(100/mean.obser))
rmse # RMSE(%)=3.52


## Code example to reproduce LM predictions plot

pred.TN <- ggpredict(final.lm.all, terms = c("TN","plough.manag"))
plot.TN<-plot(pred.TN, add.data = TRUE ) +
  labs(x="TN (%)",y="SOC (%)", title="a")+
  scale_color_manual(values = c("#E69F00","#AA4499","#44AA99"),labels = c("arable", "improved pasture", "permanent pasture"))+
  scale_fill_manual(values = c("#E69F00","#AA4499","#44AA99"))+
  coord_cartesian(ylim=c(2, 8)) + 
  scale_y_continuous(breaks=seq(2, 8,1))+
  theme_classic()+
  theme(text = element_text(size=12),
        legend.title = element_blank(),
        legend.position = c(.25, .91),
        legend.key = element_blank(),
        legend.text = element_text(size = 8),
        legend.background = element_rect(fill = "transparent"))


pred.manag <- ggpredict(final.lm.all, terms = c("plough.manag")) 
plot.manag<-plot(pred.manag, add.data = TRUE, colors = "darkgrey") +
  labs(x="Management",y="SOC (%)", title= "b")+
  theme_classic()+
  scale_x_discrete(limits=c("arable","improved p.","permanent p."))+
  coord_cartesian(ylim=c(2, 8)) + 
  scale_y_continuous(breaks=seq(2, 8,1))+
  theme(text = element_text(size=12),
        legend.title = element_blank(),
        legend.position = c(.15, .91),
        legend.key = element_blank(),
        legend.text = element_text(size = 8),
        legend.background = element_rect(fill = "transparent"))


pred.soil <- ggpredict(final.lm.all, terms = c("soil.units","plough.manag"))
plot.soil<-plot(pred.soil, add.data = TRUE) +
  labs(x="Soil units",y="SOC (%)", title="c")+
  scale_color_manual(values = c("#E69F00","#AA4499","#44AA99"),labels = c("arable", "improved pasture", "permanent pasture"))+
  coord_cartesian(ylim=c(2, 8)) + 
  scale_y_continuous(breaks=seq(2, 8,1))+
  theme_classic()+
  theme(text = element_text(size=12),
        legend.title = element_blank(),
        legend.position = c(.35, .91),
        legend.key = element_blank(),
        legend.text = element_text(size = 8),
        legend.background = element_rect(fill = "transparent"))


pred.TEMP <- ggpredict(final.lm.all, terms = c("TEMP.MAR.OCT","plough.manag"))
plot.TEMP<-plot(pred.TEMP, add.data = TRUE) +
  labs(x="TEMP (°C)",y="SOC (%)", title="d")+
  scale_color_manual(values = c("#E69F00","#AA4499","#44AA99"),labels = c("arable", "improved pasture", "permanent pasture"))+
  scale_fill_manual(values = c("#E69F00","#AA4499","#44AA99"))+
  scale_y_continuous(breaks=(seq(2,8,1)), limits = c(2,8)) + #when ylimits doesn't work include limi in scale
  coord_cartesian(xlim=c(11.5, 13.5)) + 
  scale_x_continuous(breaks=seq(11.5, 13.5,0.5))+
  theme_classic()


pred.mow <- ggpredict(final.lm.all, terms = c("mow.late","plough.manag"))
plot.mow<-plot(pred.mow, add.data = TRUE ) +
  labs(x="Mow summer events",y="SOC (%)", title="e")+
  scale_color_manual(values = c("#E69F00","#AA4499","#44AA99"),labels = c("arable", "improved pasture", "permanent pasture"))+
  scale_fill_manual(values = c("#E69F00","#AA4499","#44AA99"))+
  #coord_cartesian(ylim=c(2, 8)) + 
  scale_y_continuous(breaks=(seq(2, 8,1)),limits =c(2, 8) )+
  coord_cartesian(xlim=c(0, 2)) + 
  scale_x_continuous(breaks=seq(0, 2 ,1))+
  theme_classic()+
  theme(text = element_text(size=12),
        legend.title = element_blank(),
        legend.position = c(.85, .91),
        legend.key = element_blank(),
        legend.text = element_text(size = 8),
        legend.background = element_rect(fill = "transparent"))


(plot.TN | plot.manag | plot.soil + theme(legend.position="none")|plot.TEMP + theme(legend.position="none") |  plot.mow + theme(legend.position="none") ) #ordered by relative importance

#### 3.2. Linear mixed model (Field as random factor) ####

final.lmm.all<-lmer(MEAN.TC ~plough.manag + soil.units + mow.late + TN+ TEMP.MAR.OCT+ (1|Field.Name),data=ESPI)
summary(final.lmm.all)
anova(final.lmm.all) # we use F as proxy relative importance
aicc(final.lmm.all)

# diagnostic plots 

plot_model(final.lmm.all, type = "diag") 
plot_model(final.lmm.all, type = "re", show.values = F) # to visualize random effects

# explained variability by random factor 

r.squaredGLMM(final.lmm.all)

# Follow the examples in section 3.1  to reproduce LMM predictions plot and get RMSE

#### 3.3. Generalised additive model ####

final.gam.all<-gam(MEAN.TC ~plough.manag + soil.units + ESPI.MAR.OCT + TN + s(TEMP.MAR.OCT,k=6) , method = "REML",data=ESPI)
summary(final.gam.all)
anova(final.gam.all)
aiccc(final.gam.all)

# diagnostic plots

set.seed(100)
par(mfrow=c(2,2))
appraise(final.gam.all, method = "simulate") & theme_bw() 

concurvity(final.gam.all, full = TRUE)
concurvity(final.gam.all, full = FALSE)

# model without ESPI?

final.gam.all.b<-gam(MEAN.TC ~plough.manag + soil.units + TN + s(TEMP.MAR.OCT,k=6) , method = "REML",data=ESPI) 
summary(final.gam.all.b)
aicc(final.gam.all.b)

# diagnostic plots

set.seed(100)
par(mfrow=c(2,2))
appraise(final.gam.all.b, method = "simulate") & theme_bw() 

concurvity(final.gam.all.b, full = TRUE)
concurvity(final.gam.all.b, full = FALSE)

# which is a best model?

anova(final.gam.all.b,final.gam.all, test="F") # including ESPI does not improve the explanatory power of the model but p=0.0538, model diagnosis seems better with ESPI

# Partial effects of the selected model 

tn.gam<-draw(parametric_effects(final.gam.all,terms = "TN")) &
  labs(title= "") & 
  theme_bw() 

manag.gam<-draw(parametric_effects(final.gam.all,terms = "plough.manag")) & 
  labs(x="Management",y="Partial effect",title= "") & 
  theme_bw()

soil.gam<-draw(parametric_effects(final.gam.all,terms = "soil.units")) & 
  labs(x="Soil units",y="Partial effect",title= "") & 
  theme_bw()

ESPI.gam<-draw(parametric_effects(final.gam.all,terms = "ESPI.MAR.OCT")) & 
  labs(x="ESPI",y="Partial effect",title= "") & 
  theme_bw()

sTEMP<-draw(final.gam.all,residuals=TRUE,resid_col = "grey") & 
  labs(x="TEMP",y="Partial effect",title= "s(TEMP)") & theme_bw()

tn.gam + manag.gam + soil.gam + ESPI.gam + sTEMP + plot_layout(ncol = 2, nrow=3)

# model validation: measured vs predicted 

pred_gam.all <- as.data.frame(predict.gam(final.gam.all, ESPI, se.fit=TRUE, type="response")) #fit values
pred_gam.all$obs <- ESPI$MEAN.TC #observed values
pred_gam.all$manag <- ESPI$plough.manag 

pred_gam.all

pred_gam.all%>%
  na.omit()

pred_gam.all<-filter(pred_gam.all,!obs=="na")

str(pred_gam.all)

#coefficient of determination r

with(pred_gam.all,cor(obs, fit, method = c("pearson")))

#measured vs predicted grouped by management
gam.pred.vs.obs.manag <- ggplot(pred_gam.all, aes(x=obs, y=fit, color=manag)) +
  geom_point(shape=20, size = 3, stroke = 1) +
  geom_abline(intercept=0, slope=1) +
  xlab("SOC measured") + 
  ylab("SOC fitted")+
  scale_y_continuous(limits = c(2, 7)) +
  scale_x_continuous(limits = c(2, 7))+
  scale_color_manual(values = c("#E69F00","#AA4499","#44AA99"),labels = c("arable", "improved pasture", "permanent pasture"))+
  geom_smooth(method=lm , color="red", fill="#69b3a2", se=TRUE) +
  theme_bw()+
  theme(text = element_text(size=12),
        legend.title = element_blank(),
        legend.position = c(.15, .91),
        legend.key = element_rect(fill = NA),
        legend.text = element_text(size = 12),
        legend.background = element_rect(fill = "transparent"))+
  geom_segment(aes(x = 6.3, y = 2, xend = 6.7, yend = 2),colour="black") +
  annotate(geom="text", x=6.95, y=2.03, label="1:1", size=6)

# RMSE
# function

rmse2 <- function(x, y, na.rm = TRUE){
  res <- sqrt(mean((x-y)^2, na.rm = na.rm))
  return(res)
}

# call to function

rmse2(x = pred_gam.all$obs, y = pred_gam.all$fit)  #0.1325251


#with eq to report as %

mean.obser<-mean(pred_gam.all$obs)
mean.obser

rmse<-(0.1325251*(100/mean.obser))
rmse #2.855949

# Cross validation for final.gam.all model (based on http://www.sthda.com/english/articles/38-regression-model-validation/157-cross-validation-essentials-in-r/ )

scores <- data.frame() #Create empty df to store CV R2 values
devian.expl<-data.frame()
set.seed(40) #Set seed for reproducibility
k <- 10 #Define number of folds

# For loop for each fold
for (i in 1:k) {
  
  # Create weighting column.
  
  ESPI$weight <- sample(c(0,1),size=nrow(ESPI),replace=TRUE,prob=c(0.2,0.8)) #0 Indicates testing sample, 1 training sample. Split into training and test sample (80/20 ratio)
  ESPI$weight
  
  # Generate training dataset
  
  trainingdata <- ESPI[ESPI$weight == 1, ] #Select test data by weight  
  trainingdata%>%
    na.omit()
  
  # Generate test dataset
  
  testdata <- ESPI[ESPI$weight == 0, ] #Select test data by weight
  testdata%>%
    na.omit()
  
  # Run GAM with training data 
  
  m.training.nonopen<-gam(MEAN.TC ~plough.manag + soil.units +ESPI.MAR.OCT + TN + s(TEMP.MAR.OCT,k=6),method = "REML",data = trainingdata) 
  summary(m.training.nonopen)
  
  # Predict using test data
  
  pred <- predict.gam(m.training.nonopen,newdata = testdata)
  pred
  
  #Extract scores

  scores[i,1] <- summary(m.training.nonopen)$r.sq #this is the adjusted because gam summary doesn't provide R-sq 
  devian.expl[i,1]<- summary(m.training.nonopen)$dev.expl
}

scores
devian.expl

#Get average scores from each k-fold test

av.r2.GMRF <- mean(scores$V1)
av.r2.GMRF
av.dev.GMRF <- mean(devian.expl$V1)
av.dev.GMRF

# print scores
folds<-c(1,2,3,4,5,6,7,8,9,10)
scores2 <- cbind(scores, folds)
dev<-cbind(devian.expl,folds)

# plot

cv.nonopen.r<-ggplot(data=scores2, aes(x=folds, y=V1, group=1)) +
  geom_boxplot(color="#00b2ca") +
  geom_point(size=4, color="#00b2ca") +
  labs(y=expression(Radj^2),x="folds")+
  scale_y_continuous(breaks=seq(0.97,1.00,0.005))+
  scale_x_continuous(breaks=seq(1, 10,1))+
  annotate(geom="text", x=8, y=0.99, label= expression(model~Radj^2~0.98), size=4)+
  theme_classic()

# Code to reproduce GAM predictions plot

pred.TN.gam <- ggpredict(final.gam.all, terms = c("TN","plough.manag"))
plot.TN.gam<-plot(pred.TN.gam, add.data = TRUE ) +
  labs(x="TN (%)",y="SOC (%)", title="a")+
  scale_color_manual(values = c("#E69F00","#AA4499","#44AA99"),labels = c("arable", "improved pasture", "permanent pasture"))+
  scale_fill_manual(values = c("#E69F00","#AA4499","#44AA99"))+
  coord_cartesian(ylim=c(2, 8)) + 
  scale_y_continuous(breaks=seq(2, 8,1))+
  theme_classic()+
  theme(text = element_text(size=12),
        legend.title = element_blank(),
        legend.position = c(.25, .91),
        legend.key = element_blank(),
        legend.text = element_text(size = 8),
        legend.background = element_rect(fill = "transparent"))+
  annotate(geom="text", x=0.7, y=8, label="F=1556.9", size=4)

pred.TEMP.gam <- ggpredict(final.gam.all, terms = c("TEMP.MAR.OCT","plough.manag"))
plot.TEMP.gam<-plot(pred.TEMP.gam, add.data = TRUE) +
  theme_bw() +
  labs(x="TEMP (°C)",y="SOC (%)", title="b")+
  scale_color_manual(values = c("#E69F00","#AA4499","#44AA99"),labels = c("arable", "improved pasture", "permanent pasture"))+
  scale_fill_manual(values = c("#E69F00","#AA4499","#44AA99"))+
  scale_y_continuous(breaks =(seq(2, 8,1)),limits =c(2, 8))+
  coord_cartesian(xlim=c(11.5, 13.3)) + 
  scale_x_continuous(breaks=seq(11.5, 13.3,0.3))+
  theme_classic()+
  theme(text = element_text(size=12),
        legend.title = element_blank(),
        legend.position = c(.45, .91),
        legend.key = element_blank(),
        legend.text = element_text(size = 8),
        legend.background = element_rect(fill = "transparent"))+
  annotate(geom="text", x=12.9, y=7.6, label="edf = 3.1", size=4)+
  annotate(geom="text", x=12.9, y=8, label="F=67.8", size=4)

pred.soil.gam <- ggpredict(final.gam.all, terms = c("soil.units","plough.manag"))
plot.soil.gam<-plot(pred.soil.gam, add.data = TRUE) +
  labs(x="Soil units",y="SOC (%)", title="c")+
  scale_color_manual(values = c("#E69F00","#AA4499","#44AA99"),labels = c("arable", "improved pasture", "permanent pasture"))+
  scale_fill_manual(values = c("#E69F00","#AA4499","#44AA99"))+
  coord_cartesian(ylim=c(2, 8)) + 
  scale_y_continuous(breaks=seq(2, 8,1))+
  theme_classic()+
  theme(text = element_text(size=12),
        legend.title = element_blank(),
        legend.position = c(.35, .91),
        legend.key = element_blank(),
        legend.text = element_text(size = 8),
        legend.background = element_rect(fill = "transparent"))+
  annotate(geom="text", x=5, y=8, label="F=15.7", size=4)

pred.manag.gam <- ggpredict(final.gam.all, terms = c("plough.manag"))  
plot.manag.gam<-plot(pred.manag.gam, add.data = TRUE, colors = "darkgrey") +
  labs(x="Management",y="SOC (%)", title= "d")+
  theme_classic()+
  scale_x_discrete(limits=c("arable","improved p.","permanent p."))+
  scale_fill_manual(values = c("#E69F00","#AA4499","#44AA99"))+
  coord_cartesian(ylim=c(2, 8)) + 
  scale_y_continuous(breaks=seq(2, 8,1))+
  theme(text = element_text(size=12),
        legend.title = element_blank(),
        legend.position = c(.15, .91),
        legend.key = element_blank(),
        legend.text = element_text(size = 8),
        legend.background = element_rect(fill = "transparent"))+
  annotate(geom="text", x=3, y=8, label="F=7.0", size=4) #p=0.0015

pred.ESPI.gam <- ggpredict(final.gam.all, terms = c("ESPI.MAR.OCT","plough.manag"))
plot.ESPI.gam<-plot(pred.ESPI.gam, add.data = TRUE) +
  theme_bw() +
  labs(x=expression(ESPI~(MJ~m^-2 ~d^-1)),y="SOC (%)", title="e") +
  scale_color_manual(values = c("#E69F00","#AA4499","#44AA99"),labels = c("arable", "improved pasture", "permanent pasture"))+
  scale_fill_manual(values = c("#E69F00","#AA4499","#44AA99"))+
  scale_y_continuous(breaks =(seq(2, 8,1)),limits =c(2, 8))+
  coord_cartesian(xlim=c(0, 5)) + 
  scale_x_continuous(breaks=seq(0, 5,1))+
  theme_classic()+
  theme(text = element_text(size=12),
        legend.title = element_blank(),
        legend.position = c(.45, .91),
        legend.key = element_blank(),
        legend.text = element_text(size = 8),
        legend.background = element_rect(fill = "transparent"))+
  annotate(geom="text", x=4, y=8, label="F=3.6", size=4)

plot.ESPI.gam

(plot.TN.gam |plot.TEMP.gam + theme(legend.position="none") | plot.soil.gam + theme(legend.position="none")|plot.manag.gam + theme(legend.position="none") | plot.ESPI.gam + theme(legend.position="none")) #ordered by relative importance

#### 3.4. Generalised additive mixed model (Field as random factor) ####

final.gamm.all<-gam(MEAN.TC ~plough.manag + soil.units +ESPI.MAR.OCT + TN + s(TEMP.MAR.OCT,k=6) + s(Field.Name, bs="re") , method = "REML",data=ESPI) 
summary(final.gamm.all)
aicc(final.gamm.all)

# diagnostic plot

par(mfrow=c(2,2))
gam.check(final.gamm.all) 
appraise(final.gamm.all, method = "simulate") & theme_bw()

concurvity(final.gamm.all, full = TRUE) #concurvity issues when add random factor is a common problem when you have a covariate as a random factor, and you have other factors that are function of space 
concurvity(final.gamm.all, full = FALSE) 

# Partial effects

tn.gamm<-draw(parametric_effects(final.gamm.all,terms = "TN")) &
  labs(title= "") & 
  theme_bw()

manag.gamm<-draw(parametric_effects(final.gamm.all,terms = "plough.manag")) & 
  labs(x="Management",y="Partial effect",title= "") & 
  theme_bw()

soil.gamm<-draw(parametric_effects(final.gamm.all,terms = "soil.units")) & 
  labs(x="Soil units",y="Partial effect",title= "") & 
  theme_bw()

ESPI.gamm<-draw(parametric_effects(final.gam.all,terms = "ESPI.MAR.OCT")) & 
  labs(x="ESPI",y="Partial effect",title= "") & 
  theme_bw()

sterms<-draw(final.gamm.all,residuals=TRUE,resid_col = "grey") & 
  theme_bw()

sterms<-draw(final.gamm.all,residuals=TRUE,resid_col = "grey",terms = "TEMP.MAR.OCT") & 
  labs(x="TEMP",y="Partial effect",title= "") & 
  theme_bw()

tn.gamm + manag.gamm + soil.gamm + ESPI.gamm + sterms + plot_layout(ncol = 2, nrow=3)

# model validation: get predicted and observed values, r, RMSE and prediction plots following the script 3.1 as example

#### 4. Modelling using a set of open- and non-open-source variables excluding TN (Supplementary material section D) ####

#### 4.1. Linear model ####

# Best model

final.lm.allb<-lm(MEAN.TC ~ plough.manag + soil.units + ESPI.MAR.OCT + aspect.mean + TEMP.MAR.OCT, data=ESPI)  #15/03/2023 LM selected model without TN was lmD.3, we call it final.lm.allb

summary(final.lm.allb)
summary.aov(final.lm.allb)


# diagnostic plots

resid_panel(final.lm.allb, plots = c("resid","qq","hist","lev"),type = NA, bins = 30,
            smoother = FALSE, qqline = TRUE, qqbands = FALSE, scale = 1,
            theme = "bw", axis.text.size = 12, title.text.size = 12,
            title.opt = TRUE, nrow = NULL)

# relative importance of variables

calc.relimp(final.lm.allb, type = c("lmg"), rela = TRUE)

# further exploration

set_theme(base = theme_bw())
plot_model(final.lm.allb, type = "resid")
plot_model(final.lm.allb, type = "slope")

# model validation: get predicted and observed values, r, RMSE and prediction plots following the script 3.1 as example

#### 4.2. Linear mixed model (Field as random factor) ####

# Best model

final.lmm.allb<-lmer(MEAN.TC ~plough.manag + soil.units + ESPI.MAR.OCT +  TEMP.MAR.OCT+ (1|Field.Name),data=ESPI) 

summary.aov(final.lmm.allb)

# diagnostic plots and further exploration

set_theme(base = theme_bw()) #to theme bw as other plots
plot_model(final.lmm.allb, type = "diag") 
plot_model(final.lmm.allb, type = "re", show.values = F) #random effects

# explained variability by random factor with library(MuMIn)

r.squaredGLMM(final.lmm.allb)

# model validation: get predicted and observed values, r, RMSE and prediction plots following the script 3.1 as example

#### 4.3. Generalised additive model ####

# GAM: we have two alternatives for the final model that are equivalents

# GAM 1: with TEMP

final1.gam.allb<-gam(MEAN.TC ~plough.manag + soil.units + s(aspect.mean,k=6) + s(TEMP.MAR.OCT,k=6), method = "REML",data=ESPI)
summary(final1.gam.allb)
anova(final1.gam.allb)

# diagnostic plots

par(mfrow=c(2,2))
gam.check(final1.gam.allb)
DIAGNgamb.1<-appraise(final1.gam.allb, method = "simulate") & theme_bw()

# Partial effects

manag.gam.b.1<-draw(parametric_effects(final1.gam.allb,terms = "plough.manag")) & 
  labs(x="Management",y="Partial effect",title= "") & 
  theme_bw()

soil.gam.b.1<-draw(parametric_effects(final1.gam.allb,terms = "soil.units")) & 
  labs(x="Soil units",y="Partial effect",title= "") & 
  theme_bw()

aspect.gam.b.1<-draw(final1.gam.allb,terms = "aspect.mean") & 
  labs(x="Aspect",y="Partial effect",title= "") & 
  theme_bw()

temp.gam.b.1<-draw(final1.gam.allb,terms = "TEMP.MAR.OCT") & 
  labs(x="TEMP",y="Partial effect",title= "") & 
  theme_bw()

s.asp.temp.final1.b<-draw(final1.gam.allb,residuals=TRUE,resid_col = "grey") & 
  theme_bw()

manag.gam.b.1 + soil.gam.b.1 + s.asp.temp.final1.b + plot_layout(ncol = 1, nrow=3)

# model validation: get predicted values vs measured

pred_gam.allb.1 <- as.data.frame(predict.gam(final1.gam.allb, ESPI, se.fit=TRUE, type="response"))
pred_gam.allb.1$obs <- ESPI$MEAN.TC 
pred_gam.allb.1$manag <- ESPI$plough.manag 

pred_gam.allb.1

pred_gam.allb.1%>%
  na.omit()

pred_gam.allb.1<-filter(pred_gam.allb.1,!obs=="na")

# coefficient of determination

with(pred_gam.allb.1,cor(obs, fit, method = c("pearson")))

# measured vs predicted grouped by management
gam.pred.vs.obs.managb1 <- ggplot(pred_gam.allb.1, aes(x=obs, y=fit, color=manag)) +
  geom_point(shape=20, size = 3, stroke = 1) +
  geom_abline(intercept=0, slope=1) +
  xlab("SOC measured") + 
  ylab("SOC fitted")+
  scale_y_continuous(limits = c(2, 7)) +
  scale_x_continuous(limits = c(2, 7))+
  scale_color_manual(values = c("#E69F00","#AA4499","#44AA99"),labels = c("arable", "improved pasture", "permanent pasture"))+
  geom_smooth(method=lm , color="red", fill="#69b3a2", se=TRUE) +
  theme_bw()+
  theme(text = element_text(size=12),
        legend.title = element_blank(),
        legend.position = c(.15, .91),
        legend.key = element_rect(fill = NA),
        legend.text = element_text(size = 12),
        legend.background = element_rect(fill = "transparent"))+
  geom_segment(aes(x = 6.3, y = 2, xend = 6.7, yend = 2),colour="black") +
  annotate(geom="text", x=6.95, y=2.03, label="1:1", size=6)

gam.pred.vs.obs.managb1

#cross validation

CVgam(formula=MEAN.TC ~plough.manag + soil.units + s(aspect.mean,k=6) + s(TEMP.MAR.OCT,k=6), data=ESPI, nfold = 10, debug.level = 0, method = "REML",
      printit = TRUE, cvparts = NULL, gamma = 1, seed = 100)


scores <- data.frame()
devian.expl<-data.frame()
set.seed(40) 
k <- 10 

# For loop for each fold
for (i in 1:k) {
  
  # Create weighting column.
  ESPI$weight <- sample(c(0,1),size=nrow(ESPI),replace=TRUE,prob=c(0.2,0.8)) 

  
  # Generate training dataseT
  trainingdata <- ESPI[ESPI$weight == 1, ] 
  
  # Generate test dataset
  testdata <- ESPI[ESPI$weight == 0, ] 
  
  # Run GAM with training data 
  m.training.nonopenb1<-gam(MEAN.TC ~plough.manag + soil.units + s(aspect.mean,k=6) + s(TEMP.MAR.OCT,k=6),method = "REML",data = trainingdata) 
  summary(m.training.nonopenb1)

  # Predict using test data
  pred <- predict.gam(m.training.nonopenb1,newdata = testdata)

  # Extract
  scores[i,1] <- summary(m.training.nonopenb1)$r.sq
  devian.expl[i,1]<- summary(m.training.nonopenb1)$dev.expl
}

scores
devian.expl


# Get average scores from each k-fold test

av.r2.GMRF <- mean(scores$V1) #0.8205425
av.r2.GMRF
av.dev.GMRF <- mean(devian.expl$V1) #0.8494907
av.dev.GMRF


# print scores
folds<-c(1,2,3,4,5,6,7,8,9,10)
scores2 <- cbind(scores, folds)
dev<-cbind(devian.expl,folds)

cv.nonopenb1.r<-ggplot(data=scores2, aes(x=folds, y=V1, group=1)) +
  geom_boxplot(color="#00b2ca") +
  geom_point() +
  ylab('R-sq.(adj)')+
  scale_y_continuous(breaks=seq(0.80,0.85,0.01))+
  scale_x_continuous(breaks=seq(1, 10,1))+
  annotate(geom="text", x=8, y=0.85, label="model R-sq.(adj)=0.82", size=4)+
  theme_classic()
cv.nonopenb1.r

#### Reproduce Figure 3 ####

pred.manag.gamb.1 <- ggpredict(final1.gam.allb, terms = c("plough.manag"))  
plot.manag.gamb.1<-plot(pred.manag.gamb.1, add.data = TRUE, colors = "darkgrey") +
  labs(x="Management",y="SOC (%)", title= "a")+
  theme_classic()+
  scale_x_discrete(limits=c("arable","improved p.","permanent p."))+
  scale_y_continuous(limits = c(0, 8)) +
  theme(text = element_text(size=12),
        legend.title = element_blank(),
        legend.position = c(.15, .91),
        legend.key = element_blank(),
        legend.text = element_text(size = 8),
        legend.background = element_rect(fill = "transparent"))+
  annotate(geom="text", x=3, y=8, label="F=117.4", size=4)

pred.TEMP.gamb.1 <- ggpredict(final1.gam.allb, terms = c("TEMP.MAR.OCT","plough.manag"))
plot.TEMP.gamb.1<-plot(pred.TEMP.gamb.1, add.data = TRUE) +
  labs(x="TEMP (°C)",y="SOC (%)", title="b")+
  scale_color_manual(values = c("#E69F00","#AA4499","#44AA99"),labels = c("arable", "improved pasture", "permanent pasture"))+
  scale_fill_manual(values = c("#E69F00","#AA4499","#44AA99"))+
  scale_y_continuous(limits = c(0, 8)) +
  coord_cartesian(xlim=c(11.5, 13.3)) + 
  scale_x_continuous(breaks=seq(11.5, 13.3,0.3))+
  theme_classic()+
  theme(text = element_text(size=12),
        legend.title = element_blank(),
        legend.position = c(.25, .91),
        legend.key = element_blank(),
        legend.text = element_text(size = 8),
        legend.background = element_rect(fill = "transparent"))+
  annotate(geom="text", x=13, y=8, label="F=28.0", size=4)+ #, p< 2e-16
  annotate(geom="text", x=13, y=7.45, label="edf=4.4", size=4)

pred.soil.gamb.1 <- ggpredict(final1.gam.allb, terms = c("soil.units","plough.manag")) 
plot.soil.gamb.1<-plot(pred.soil.gamb.1, add.data = TRUE) +
  labs(x="Soil units",y="SOC (%)", title="c")+
  scale_color_manual(values = c("#E69F00","#AA4499","#44AA99"),labels = c("arable", "improved pasture", "permanent pasture"))+
  scale_y_continuous(limits = c(0, 8)) +
  theme_classic()+
  theme(text = element_text(size=12),
        legend.title = element_blank(),
        legend.position = c(.25, 1),
        legend.key = element_blank(),
        legend.text = element_text(size = 8),
        legend.background = element_rect(fill = "transparent"))+
  annotate(geom="text", x=5.3, y=8, label="F=9.2", size=4)

pred.asp.gamb.1 <- ggpredict(final1.gam.allb, terms = c("aspect.mean","plough.manag"))
plot.asp.gamb.1<-plot(pred.asp.gamb.1, add.data = TRUE) +
  labs(x="Aspect (°)",y="SOC (%)", title="d")+
  scale_color_manual(values = c("#E69F00","#AA4499","#44AA99"),labels = c("arable", "improved pasture", "permanent pasture"))+
  scale_fill_manual(values = c("#E69F00","#AA4499","#44AA99"))+
  scale_y_continuous(limits = c(0, 8)) +
  coord_cartesian(xlim=c(67.5, 337.5)) + 
  scale_x_continuous(breaks=seq(67.5, 337.5,45))+
  theme_classic()+
  theme(text = element_text(size=12),
        legend.title = element_blank(),
        legend.position = c(.45, .91),
        legend.key = element_blank(),
        legend.text = element_text(size = 8),
        legend.background = element_rect(fill = "transparent"))+
  annotate(geom="text", x=270, y=7.45, label="edf=4.1", size=4)+
  annotate(geom="text", x=270, y=8, label="F=8.0", size=4)+
  annotate(geom="text", x=90, y=0, label="E", size=4)+
  annotate(geom="text", x=135, y=0, label="SE", size=4)+
  annotate(geom="text", x=180, y=0, label="S", size=4)+
  annotate(geom="text", x=225, y=0, label="SW", size=4)+
  annotate(geom="text", x=270, y=0, label="W", size=4)+
  annotate(geom="text", x=315, y=0, label="NW", size=4)

(plot.manag.gamb.1 | plot.TEMP.gamb.1  | plot.soil.gamb.1 + theme(legend.position="none") | plot.asp.gamb.1 + theme(legend.position="none")) 

# GAM final 2: with PPT (in Supplementary material)

final2.gam.allb<-gam(MEAN.TC ~plough.manag + soil.units + s(aspect.mean,k=6) + s(PPT.MAR.OCT,k=6), method = "REML",data=ESPI) 
summary(final2.gam.allb)
anova(final2.gam.allb)
AICc(final2.gam.allb)

# diagnostic plot

par(mfrow=c(2,2))
DIAGNgamb.2<-appraise(final2.gam.allb, method = "simulate")& theme_bw()

# Partial effects

manag.gam.b.2<-draw(parametric_effects(final2.gam.allb,terms = "plough.manag")) & 
  labs(x="Management",y="Partial effect",title= "") & 
  theme_bw()

soil.gam.b.2<-draw(parametric_effects(final2.gam.allb,terms = "soil.units")) & 
  labs(x="Soil units",y="Partial effect",title= "") & 
  theme_bw()

aspect.gam.b.2<-draw(final2.gam.allb,terms = "aspect.mean") & 
  labs(x="Aspect",y="Partial effect",title= "") & 
  theme_bw()

temp.gam.b.2<-draw(final2.gam.allb,terms = "PPT.MAR.OCT") & 
  labs(x="PPT",y="Partial effect",title= "") & 
  theme_bw()


s.asp.temp.final2.b<-draw(final2.gam.allb,residuals=TRUE,resid_col = "grey") & 
  theme_bw()

manag.gam.b.2 + soil.gam.b.2+ s.asp.temp.final2.b + plot_layout(ncol = 1, nrow=3)

# follow scripts examples in 4.3 to reproduce the model validation plots

#### 4.4. Generalised additive mixed model (Field as random factor) ####

# with TEMP

final1.gamm.allb<-gam(MEAN.TC ~plough.manag + soil.units + s(TEMP.MAR.OCT,k=6) + s(Field.Name, bs="re") , method = "REML",data=ESPI)

# with PPT

final2.gamm.allb<-gam(MEAN.TC ~plough.manag + soil.units + s(PPT.MAR.OCT,k=6) + s(Field.Name, bs="re") , method = "REML",data=ESPI)

# follow scripts examples to reproduce the model diagnostic,validation and prediction plots

#### 5. Modelling using a set of open-source variables (Supplementary material section E) ####

#### 5.1. Linear model ####

final.lm.open<- lm(MEAN.TC ~land.cover + ESPI.MAR.OCT + aspect.mean + slope.mean,data = ESPI)
summary(final.lm.open)
summary.aov(final.lm.open)
anova(final.lm.open)
AICc(final.lm.open)

# relative importance of variables
calc.relimp(final.lm.open, type = c("lmg"), rela = TRUE)

# diagnostic plots

resid_panel(final.lm.open, plots = c("resid","qq","hist","lev"),type = NA, bins = 30,
            smoother = FALSE, qqline = TRUE, qqbands = FALSE, scale = 1,
            theme = "bw", axis.text.size = 12, title.text.size = 12,
            title.opt = TRUE, nrow = NULL)

set_theme(base = theme_bw())
plot_model(final.lm.open, type = "resid") 
plot_model(final.lm.open, type = "slope")

# model validation: get predicted and observed values, r, RMSE and prediction plots following the script 3.1 as example

#### 5.2. Linear mixed models ####

# LMM with land cover

final_lmer_Open1<- lmer(MEAN.TC ~land.cover + slope.mean + PPT.MAR.OCT + (1|Field.Name),data = ESPI) # final_lmm_open 1
summary(final_lmer_Open1)
anova(final_lmer_Open1) 
aicc(final_lmer_Open1)
r.squaredGLMM(final_lmer_Open1)


# diagnostic plots

set_theme(base = theme_bw())
par(mfrow=c(2,2))
plot_model(final_lmer_Open1, type = "diag") 
plot_model(final_lmer_Open1, type = "re", show.values = F)

# LMM with ESPI

final_lmer_Open2<- lmer(MEAN.TC ~ ESPI.MAR.OCT + slope.mean + PPT.MAR.OCT + (1|Field.Name),data = ESPI) #ESPI instead of land cover
summary(final_lmer_Open2)
anova(final_lmer_Open2) 
aicc(final_lmer_Open1)
r.squaredGLMM(final_lmer_Open2)

# diagnostic plots 

set_theme(base = theme_bw())
par(mfrow=c(2,2))
plot_model(final_lmer_Open2, type = "diag") 
plot_model(final_lmer_Open2, type = "re", show.values = F)

#### 5.3. Generalised additive model ####

# Best final model GAM

final.gam.open<- gam(MEAN.TC ~ s(ESPI.MAR.OCT,k=6)  +s(aspect.mean, k=6)+ s(slope.mean,k=8),method = "REML",data = ESPI)  
summary(final.gam.open)
anova(final.gam.open)

par(mfrow=c(2,2))
appraise(final.gam.open, method = "simulate")& theme_bw()
gam.check(final.gam.open)

concurvity(final.gam.open, full = TRUE) 
concurvity(final.gam.open, full = FALSE) 

# Partial effects

s.open<-draw(final.gam.open,residuals=TRUE,resid_col = "grey") & 
  theme_bw()

# model validation: get predicted values and RMSE

pred_gam.open <- as.data.frame(predict.gam(final.gam.open, ESPI, se.fit=TRUE, type="response"))
pred_gam.open$obs <- ESPI$MEAN.TC
pred_gam.open$cover <- ESPI$land.cover

pred_gam.open%>%
  na.omit()

pred_gam.open<-filter(pred_gam.open,!obs=="na")

str(pred_gam.open)

# measured vs predicted grouped by cover 

open.gam.pred.vs.obs.cover <- ggplot(pred_gam.open, aes(x=obs, y=fit, color=cover)) +
  geom_point(shape=20, size = 2, stroke = 1) +
  geom_abline(intercept=0, slope=1) +
  xlab("SOC measured") + 
  ylab("SOC fitted")+
  scale_y_continuous(limits = c(2, 7)) +
  scale_x_continuous(limits = c(2, 7))+
  scale_color_manual(values = c("#E69F00","#117733"),labels = c("arable", "grasslands"))+
  geom_smooth(method=lm , color="red", fill="#69b3a2", se=TRUE) +
  theme_bw()+
  theme(text = element_text(size=12),
        legend.title = element_blank(),
        legend.position = c(.15, .91),
        legend.key = element_rect(fill = NA),
        legend.text = element_text(size = 12),
        legend.background = element_rect(fill = "transparent")) +
  geom_segment(aes(x = 6.3, y = 2, xend = 6.7, yend = 2),colour="black") +
  annotate(geom="text", x=6.95, y=2.03, label="1:1", size=6)
open.gam.pred.vs.obs.cover 

#coeficient of determination r
with(pred_gam.open,cor(obs, fit, method = c("pearson")))

#RMSE
# function

rmse2 <- function(x, y, na.rm = TRUE){
  res <- sqrt(mean((x-y)^2, na.rm = na.rm))
  return(res)
}

# call to function               
 
rmse2(x = pred_gam.open$obs, y = pred_gam.open$fit)

# with eq to report as %

mean.obser<-mean(pred_gam.open$obs)
mean.obser

rmse #13.07%

rmse<-(0.6052815 *(100/mean.obser)) 
rmse #13.04397%

# cross validation 

scores <- data.frame()
devian.expl<-data.frame()
set.seed(40) 
k <- 10 

# For loop for each fold
for (i in 1:k) {
  
  # Create weighting column.
  
  ESPI$weight <- sample(c(0,1),size=nrow(ESPI),replace=TRUE,prob=c(0.2,0.8))
  ESPI$weight
  
  # Generate training dataset
  trainingdata <- ESPI[ESPI$weight == 1, ] 
  
  # Generate test dataset
  testdata <- ESPI[ESPI$weight == 0, ]
  
  # Run GAM with training data 
  m.training<-gam(MEAN.TC ~ s(ESPI.MAR.OCT,k=6)  +s(aspect.mean, k=6)+ s(slope.mean,k=8),method = "REML",data = trainingdata) 
  summary(m.training)

  
  # Predict using test data
  pred <- predict.gam(m.training,newdata = testdata)
  pred
  
  # Extract
  scores[i,1] <- summary(m.training)$r.sq
  devian.expl[i,1]<- summary(m.training)$dev.expl
}

scores
devian.expl


#Get average scores from each k-fold test

av.r2.GMRF <- mean(scores$V1) #0.628242
av.r2.GMRF
av.dev.GMRF <- mean(devian.expl$V1) #0.68
av.dev.GMRF

#print scores
folds<-c(1,2,3,4,5,6,7,8,9,10)
scores2 <- cbind(scores, folds)
dev<-cbind(devian.expl,folds)

cv.open.r<-ggplot(data=scores2, aes(x=folds, y=V1, group=1)) +
  geom_boxplot(color="#00b2ca") +
  geom_point() +
  ylab('R-sq.(adj)')+
  scale_x_continuous(breaks=seq(1, 10,1))+
  annotate(geom="text", x=8, y=0.71, label="model R-sq.(adj)=0.64", size=4)+
  theme_classic()
cv.open.r

#### Reproduce Figure 4 #### 

pred.gam.open.ESPI<-ggpredict(final.gam.open,terms=c("ESPI.MAR.OCT"))
plot.gam.open.ESPI<-ggplot(pred.gam.open.ESPI, aes(x,predicted))+ 
  geom_smooth(method="gam", formula = y ~s(x))+
  geom_ribbon(aes(ymin=conf.low,ymax=conf.high),alpha=.1)+
  labs(x=expression(ESPI~(MJ~m^-2 ~d^-1)),y="SOC (%)", title="c")+
  scale_y_continuous(limits = c(0, 8)) +
  theme_classic()

plot.gam.open.ESPI.dat<-plot.gam.open.ESPI + 
  geom_point(data=ESPI, aes(ESPI.MAR.OCT,MEAN.TC,color=land.cover), size = 2,alpha=.5)+
  scale_color_manual(values = c("#E69F00","#117733"),labels = c("arable", "grasslands"))+
  theme(text = element_text(size=14))+
  theme(legend.position="none")+
  annotate(geom="text", x=4.5, y=8, label="F=5.2", size=4)+
  annotate(geom="text", x=4.5, y=7.6, label="edf=2.4", size=4)

pred.gam.open.aspect<-ggpredict(final.gam.open,terms=c("aspect.mean"))
plot.gam.open.aspect<-ggplot(pred.gam.open.aspect, aes(x,predicted))+ 
  geom_smooth(method="gam", formula = y ~s(x))+
  geom_ribbon(aes(ymin=conf.low,ymax=conf.high),alpha=.1)+
  scale_y_continuous(limits = c(0, 8)) +
  labs(x="Aspect (°)",y="SOC (%)", title="b")+
  coord_cartesian(xlim=c(67.5, 337.5)) + 
  scale_x_continuous(breaks=seq(67.5, 337.5,45))+
  theme_classic()+
  theme(text = element_text(size=14))+
  annotate(geom="text", x=292, y=8, label="F=5.3", size=4)+
  annotate(geom="text", x=292, y=7.6, label="edf=4.6", size=4)+
  annotate(geom="text", x=90, y=0, label="E", size=4)+
  annotate(geom="text", x=135, y=0, label="SE", size=4)+
  annotate(geom="text", x=180, y=0, label="S", size=4)+
  annotate(geom="text", x=225, y=0, label="SW", size=4)+
  annotate(geom="text", x=270, y=0, label="W", size=4)+
  annotate(geom="text", x=315, y=0, label="NW", size=4)

plot.gam.open.aspect.dat<-plot.gam.open.aspect +  
  geom_point(data=ESPI, aes(aspect.mean, MEAN.TC,color=land.cover),size = 2,alpha=.5)+
  scale_color_manual(values = c("#E69F00","#117733"),labels = c("arable", "grasslands"))+
  theme(legend.position="none")

pred.gam.open.slope<-ggpredict(final.gam.open,terms=c("slope.mean"))
plot.gam.open.slope<-ggplot(pred.gam.open.slope, aes(x,predicted))+ 
  geom_smooth(method="gam", formula = y ~s(x))+
  geom_ribbon(aes(ymin=conf.low,ymax=conf.high),alpha=.1)+
  labs(x="Slope (°)",y="SOC (%)", title="a")+
  scale_y_continuous(limits = c(0, 8)) +
  theme_classic()+
  theme(text = element_text(size=14),
        legend.title = element_blank(),
        legend.position = c(.2, .95),
        legend.background = element_rect(fill = "transparent")) +
  annotate(geom="text", x=5.4, y=8, label="F=16.2", size=4)+
  annotate(geom="text", x=5.4, y=7.6, label="edf=6.0", size=4)

plot.gam.open.slope.dat<-plot.gam.open.slope + 
  geom_point(data=ESPI, aes(slope.mean, MEAN.TC,color= land.cover),size = 2,alpha=.5)+
  scale_color_manual(values = c("#E69F00","#117733"),labels = c("arable", "grasslands")) #add data

plot.gam.open.slope.dat + plot.gam.open.aspect.dat +plot.gam.open.ESPI.dat

# final model with land cover

final.gam.openb<- gam(MEAN.TC ~ land.cover  +s(aspect.mean, k=6)+ s(slope.mean,k=8),method = "REML",data = ESPI)  #alternative
summary(final.gam.openb)

# get diagnostic, model validation and predictions following the previous script as example

#### 5.4. Generalised additive mixed model

final.gamm.open<- gam(MEAN.TC ~ s(ESPI.MAR.OCT,k=6) + s(slope.mean,k=8) +s(Field.Name, bs="re"),method = "REML",data = ESPI)
summary(final.gamm.open) 

par(mfrow=c(2,2))
appraise(final.gamm.open, method = "simulate")& theme_bw()
concurvity(final.gamm.open, full = TRUE)
concurvity(final.gamm.open, full = FALSE) 

 #Partial effects
gamm.open<-draw(final.gamm.open,residuals=TRUE,resid_col = "grey") & 
  theme_bw()

# model validation: get predicted values

pred_gamm.open <- as.data.frame(predict.gam(final.gamm.open, ESPI, se.fit=TRUE, type="response"))
pred_gamm.open$obs <- ESPI$MEAN.TC 
pred_gamm.open$cover <- ESPI$land.cover
pred_gamm.open%>%
  na.omit()

pred_gamm.open<-filter(pred_gamm.open,!obs=="na")

str(pred_gamm.open)

# measured vs predicted grouped by cover 
open.gamm.pred.vs.obs.cover <- ggplot(pred_gamm.open, aes(x=obs, y=fit, color=cover)) +
  geom_point(shape=20, size = 2, stroke = 1) +
  geom_abline(intercept=0, slope=1) +
  xlab("SOC measured") + 
  ylab("SOC fitted")+
  scale_y_continuous(limits = c(2, 7)) +
  scale_x_continuous(limits = c(2, 7))+
  scale_color_manual(values = c("#E69F00","#117733"),labels = c("arable", "grasslands"))+
  geom_smooth(method=lm , color="red", fill="#69b3a2", se=TRUE) +
  theme_bw()+
  theme(text = element_text(size=12),
        legend.title = element_blank(),
        legend.position = c(.15, .91),
        legend.key = element_rect(fill = NA),
        legend.text = element_text(size = 12),
        legend.background = element_rect(fill = "transparent")) +
  geom_segment(aes(x = 6.3, y = 2, xend = 6.7, yend = 2),colour="black") +
  annotate(geom="text", x=6.95, y=2.03, label="1:1", size=6)
open.gamm.pred.vs.obs.cover 


# coefficient of determination
with(pred_gamm.open,cor(obs, fit, method = c("pearson")))

#RMSE
# function
rmse2 <- function(x, y, na.rm = TRUE){
  res <- sqrt(mean((x-y)^2, na.rm = na.rm))
  return(res)
}

# call to function               
rmse2(x = pred_gamm.open$obs, y = pred_gamm.open$fit)  # 0.5033977

# with eq to report as %
mean.obser<-mean(pred_gamm.open$obs)
mean.obser

rmse<-(0.5033977*(100/mean.obser)) 
rmse #10.84835

# plot model predictions following previous scripts as example

#### end ####