書名： Jupyter for Data Science
作者名： Dan Toomey
本章字數： 903字
更新時間： 2021-07-08 09:22:34

Insurance, R - non-life insurance pricing

We have an example of using R to come up with the pricing for non-life products, specifically mopeds, at http://www.cybaea.net/journal/2012/03/13/R-code-for-Chapter-2-of-Non_Life-Insurance-Pricing-with-GLM/. The code first creates a table of the statistics available for the product line, then compares the pricing to actual statistics in use.

The first part of the code that accumulates the data is as follows:

con <- url("http://www2.math.su.se/~esbj/GLMbook/moppe.sas")
data <- readLines(con, n = 200L, warn = FALSE, encoding = "unknown")
close(con)
## Find the data range
data.start <- grep("^cards;", data) + 1L
data.end   <- grep("^;", data[data.start:999L]) + data.start - 2L
table.1.2  <- read.table(text = data[data.start:data.end],
                       header = FALSE,
                       sep = "",
                       quote = "",
col.names = c("premiekl", "moptva", "zon", "dur",
              "medskad", "antskad", "riskpre", "helpre", "cell"),
                         na.strings = NULL,
                         colClasses = c(rep("factor", 3), "numeric",
                                        rep("integer", 4), "NULL"),
                                            comment.char = "")
rm(con, data, data.start, data.end)    
# Remainder of Script adds comments/descriptions
comment(table.1.2) <-
  c("Title: Partial casco moped insurance from Wasa insurance, 1994--1999",
    "Source: http://www2.math.su.se/~esbj/GLMbook/moppe.sas",
    "Copyright: http://www2.math.su.se/~esbj/GLMbook/")
## See the SAS code for this derived field
table.1.2$skadfre = with(table.1.2, antskad / dur)
## English language column names as comments:
comment(table.1.2$premiekl) <-
  c("Name: Class",
    "Code: 1=Weight over 60kg and more than 2 gears",
    "Code: 2=Other")
comment(table.1.2$moptva)   <-
  c("Name: Age",
    "Code: 1=At most 1 year",
    "Code: 2=2 years or more")
comment(table.1.2$zon)      <-
  c("Name: Zone",
    "Code: 1=Central and semi-central parts of Sweden's three largest cities",
    "Code: 2=suburbs and middle-sized towns",
    "Code: 3=Lesser towns, except those in 5 or 7",
    "Code: 4=Small towns and countryside, except 5--7",
    "Code: 5=Northern towns",
    "Code: 6=Northern countryside",
    "Code: 7=Gotland (Sweden's largest island)")
comment(table.1.2$dur)      <-
  c("Name: Duration",
    "Unit: year")
comment(table.1.2$medskad)  <-
  c("Name: Claim severity",
    "Unit: SEK")
comment(table.1.2$antskad)  <- "Name: No. claims"
comment(table.1.2$riskpre)  <-
  c("Name: Pure premium",
    "Unit: SEK")
comment(table.1.2$helpre)   <-
  c("Name: Actual premium",
    "Note: The premium for one year according to the tariff in force 1999",
    "Unit: SEK")
comment(table.1.2$skadfre)  <-
  c("Name: Claim frequency",
    "Unit: /year")
## Save results for later
save(table.1.2, file = "table.1.2.RData")
## Print the table (not as pretty as the book)
print(table.1.2)

The resultant first 10 rows of the table are as follows:

       premiekl moptva zon    dur medskad antskad riskpre helpre    skadfre
    1         1      1   1   62.9   18256      17    4936   2049 0.27027027
    2         1      1   2  112.9   13632       7     845   1230 0.06200177
    3         1      1   3  133.1   20877       9    1411    762 0.06761833
    4         1      1   4  376.6   13045       7     242    396 0.01858736
    5         1      1   5    9.4       0       0       0    990 0.00000000
    6         1      1   6   70.8   15000       1     212    594 0.01412429
    7         1      1   7    4.4    8018       1    1829    396 0.22727273
    8         1      2   1  352.1    8232      52    1216   1229 0.14768532
    9         1      2   2  840.1    7418      69     609    738 0.08213308
    10        1      2   3 1378.3    7318      75     398    457 0.05441486

Then, we go through each product/statistics to determine whether the pricing for a product is in line with others. Note, the repos = clause on the install.packages statement is a fairly new addition to R:

# make sure the packages we want to use are installed
install.packages(c("data.table", "foreach", "ggplot2"), dependencies = TRUE, repos = "http://cran.us.r-project.org")
# load the data table we need
if (!exists("table.1.2"))
  load("table.1.2.RData")
library("foreach")
## We are looking to reproduce table 2.7 which we start building here,
## add columns for our results.
table27 <-
  data.frame(rating.factor =
               c(rep("Vehicle class", nlevels(table.1.2$premiekl)),
                 rep("Vehicle age",   nlevels(table.1.2$moptva)),
                 rep("Zone",          nlevels(table.1.2$zon))),
             class =
               c(levels(table.1.2$premiekl),
                 levels(table.1.2$moptva),
                 levels(table.1.2$zon)),
             stringsAsFactors = FALSE)
## Calculate duration per rating factor level and also set the
## contrasts (using the same idiom as in the code for the previous
## chapter). We use foreach here to execute the loop both for its
## side-effect (setting the contrasts) and to accumulate the sums.
# new.cols are set to claims, sums, levels
new.cols <-
  foreach (rating.factor = c("premiekl", "moptva", "zon"),
           .combine = rbind) %do%
{
  nclaims <- tapply(table.1.2$antskad, table.1.2[[rating.factor]], sum)
  sums <- tapply(table.1.2$dur, table.1.2[[rating.factor]], sum)
  n.levels <- nlevels(table.1.2[[rating.factor]])
  contrasts(table.1.2[[rating.factor]]) <-
    contr.treatment(n.levels)[rank(-sums, ties.method = "first"), ]
  data.frame(duration = sums, n.claims = nclaims)
}
table27 <- cbind(table27, new.cols)
rm(new.cols)
#build frequency distribution
model.frequency <-
  glm(antskad ~ premiekl + moptva + zon + offset(log(dur)),
      data = table.1.2, family = poisson)
rels <- coef( model.frequency )
rels <- exp( rels[1] + rels[-1] ) / exp( rels[1] )
table27$rels.frequency <-
    c(c(1, rels[1])[rank(-table27$duration[1:2], ties.method = "first")],
    c(1, rels[2])[rank(-table27$duration[3:4], ties.method = "first")],
    c(1, rels[3:8])[rank(-table27$duration[5:11], ties.method = "first")])
# note the severities involved
model.severity <-
  glm(medskad ~ premiekl + moptva + zon,
      data = table.1.2[table.1.2$medskad > 0, ],
      family = Gamma("log"), weights = antskad)
rels <- coef( model.severity )
rels <- exp( rels[1] + rels[-1] ) / exp( rels[1] )
## Aside: For the canonical link function use
## rels <- rels[1] / (rels[1] + rels[-1])
table27$rels.severity <-
    c(c(1, rels[1])[rank(-table27$duration[1:2], ties.method = "first")],
    c(1, rels[2])[rank(-table27$duration[3:4], ties.method = "first")],
    c(1, rels[3:8])[rank(-table27$duration[5:11], ties.method = "first")])
table27$rels.pure.premium <- with(table27, rels.frequency * rels.severity)
print(table27, digits = 2)

The resultant display is as follows:

       rating.factor class duration n.claims rels.frequency rels.severity
    1  Vehicle class     1     9833      391           1.00          1.00
    2  Vehicle class     2     8825      395           0.78          0.55
    11   Vehicle age     1     1918      141           1.55          1.79
    21   Vehicle age     2    16740      645           1.00          1.00
    12          Zone     1     1451      206           7.10          1.21
    22          Zone     2     2486      209           4.17          1.07
    3           Zone     3     2889      132           2.23          1.07
    4           Zone     4    10069      207           1.00          1.00
    5           Zone     5      246        6           1.20          1.21
    6           Zone     6     1369       23           0.79          0.98
    7           Zone     7      148        3           1.00          1.20
       rels.pure.premium
    1               1.00
    2               0.42
    11              2.78
    21              1.00
    12              8.62
    22              4.48
    3               2.38
    4               1.00
    5               1.46
    6               0.78
    7               1.20

Here, we can see that some vehicle classes (2,6) are priced very low in comparison to statistics for that vehicle where as other are overpriced (12, 22).

官术网_书友最值得收藏!

Jupyter for Data Science

Insurance, R - non-life insurance pricing