The first part of the code that accumulates the data is as follows:
con <- url("http://www2.math.su.se/~esbj/GLMbook/moppe.sas") data <- readLines(con, n = 200L, warn = FALSE, encoding = "unknown") close(con) ## Find the data range data.start <- grep("^cards;", data) + 1L data.end <- grep("^;", data[data.start:999L]) + data.start - 2L table.1.2 <- read.table(text = data[data.start:data.end], header = FALSE, sep = "", quote = "", col.names = c("premiekl", "moptva", "zon", "dur", "medskad", "antskad", "riskpre", "helpre", "cell"), na.strings = NULL, colClasses = c(rep("factor", 3), "numeric", rep("integer", 4), "NULL"), comment.char = "") rm(con, data, data.start, data.end) # Remainder of Script adds comments/descriptions comment(table.1.2) <- c("Title: Partial casco moped insurance from Wasa insurance, 1994--1999", "Source: http://www2.math.su.se/~esbj/GLMbook/moppe.sas", "Copyright: http://www2.math.su.se/~esbj/GLMbook/") ## See the SAS code for this derived field table.1.2$skadfre = with(table.1.2, antskad / dur) ## English language column names as comments: comment(table.1.2$premiekl) <- c("Name: Class", "Code: 1=Weight over 60kg and more than 2 gears", "Code: 2=Other") comment(table.1.2$moptva) <- c("Name: Age", "Code: 1=At most 1 year", "Code: 2=2 years or more") comment(table.1.2$zon) <- c("Name: Zone", "Code: 1=Central and semi-central parts of Sweden's three largest cities", "Code: 2=suburbs and middle-sized towns", "Code: 3=Lesser towns, except those in 5 or 7", "Code: 4=Small towns and countryside, except 5--7", "Code: 5=Northern towns", "Code: 6=Northern countryside", "Code: 7=Gotland (Sweden's largest island)") comment(table.1.2$dur) <- c("Name: Duration", "Unit: year") comment(table.1.2$medskad) <- c("Name: Claim severity", "Unit: SEK") comment(table.1.2$antskad) <- "Name: No. claims" comment(table.1.2$riskpre) <- c("Name: Pure premium", "Unit: SEK") comment(table.1.2$helpre) <- c("Name: Actual premium", "Note: The premium for one year according to the tariff in force 1999", "Unit: SEK") comment(table.1.2$skadfre) <- c("Name: Claim frequency", "Unit: /year") ## Save results for later save(table.1.2, file = "table.1.2.RData") ## Print the table (not as pretty as the book) print(table.1.2)
The resultant first 10 rows of the table are as follows:
Then, we go through each product/statistics to determine whether the pricing for a product is in line with others. Note, therepos =clause on theinstall.packagesstatement is a fairly new addition to R:
# make sure the packages we want to use are installed install.packages(c("data.table", "foreach", "ggplot2"), dependencies = TRUE, repos = "http://cran.us.r-project.org") # load the data table we need if (!exists("table.1.2")) load("table.1.2.RData") library("foreach") ## We are looking to reproduce table 2.7 which we start building here, ## add columns for our results. table27 <- data.frame(rating.factor = c(rep("Vehicle class", nlevels(table.1.2$premiekl)), rep("Vehicle age", nlevels(table.1.2$moptva)), rep("Zone", nlevels(table.1.2$zon))), class = c(levels(table.1.2$premiekl), levels(table.1.2$moptva), levels(table.1.2$zon)), stringsAsFactors = FALSE) ## Calculate duration per rating factor level and also set the ## contrasts (using the same idiom as in the code for the previous ## chapter). We use foreach here to execute the loop both for its ## side-effect (setting the contrasts) and to accumulate the sums. # new.cols are set to claims, sums, levels new.cols <- foreach (rating.factor = c("premiekl", "moptva", "zon"), .combine = rbind) %do% { nclaims <- tapply(table.1.2$antskad, table.1.2[[rating.factor]], sum) sums <- tapply(table.1.2$dur, table.1.2[[rating.factor]], sum) n.levels <- nlevels(table.1.2[[rating.factor]]) contrasts(table.1.2[[rating.factor]]) <- contr.treatment(n.levels)[rank(-sums, ties.method = "first"), ] data.frame(duration = sums, n.claims = nclaims) } table27 <- cbind(table27, new.cols) rm(new.cols) #build frequency distribution model.frequency <- glm(antskad ~ premiekl + moptva + zon + offset(log(dur)), data = table.1.2, family = poisson) rels <- coef( model.frequency ) rels <- exp( rels[1] + rels[-1] ) / exp( rels[1] ) table27$rels.frequency <- c(c(1, rels[1])[rank(-table27$duration[1:2], ties.method = "first")], c(1, rels[2])[rank(-table27$duration[3:4], ties.method = "first")], c(1, rels[3:8])[rank(-table27$duration[5:11], ties.method = "first")]) # note the severities involved model.severity <- glm(medskad ~ premiekl + moptva + zon, data = table.1.2[table.1.2$medskad > 0, ], family = Gamma("log"), weights = antskad) rels <- coef( model.severity ) rels <- exp( rels[1] + rels[-1] ) / exp( rels[1] ) ## Aside: For the canonical link function use ## rels <- rels[1] / (rels[1] + rels[-1]) table27$rels.severity <- c(c(1, rels[1])[rank(-table27$duration[1:2], ties.method = "first")], c(1, rels[2])[rank(-table27$duration[3:4], ties.method = "first")], c(1, rels[3:8])[rank(-table27$duration[5:11], ties.method = "first")]) table27$rels.pure.premium <- with(table27, rels.frequency * rels.severity) print(table27, digits = 2)
The resultant display is as follows:
rating.factor class duration n.claims rels.frequency rels.severity1 Vehicle class 1 9833 391 1.00 1.002 Vehicle class 2 8825 395 0.78 0.5511 Vehicle age 1 1918 141 1.55 1.7921 Vehicle age 2 16740 645 1.00 1.0012 Zone 1 1451 206 7.10 1.2122 Zone 2 2486 209 4.17 1.073 Zone 3 2889 132 2.23 1.074 Zone 4 10069 207 1.00 1.005 Zone 5 246 6 1.20 1.216 Zone 6 1369 23 0.79 0.987 Zone 7 148 3 1.00 1.20 rels.pure.premium1 1.002 0.4211 2.7821 1.0012 8.6222 4.483 2.384 1.005 1.466 0.787 1.20
Here, we can see that some vehicle classes (2,6) are priced very low in comparison to statistics for that vehicle where as other are overpriced (12, 22).