| 
    
    DATA MINING
     Desktop Survival Guide by Graham Williams  | 
    
    
     
     | 
    |||
Cluster analysis can be used to find the clusters that are most
  interesting according to some criterion. For example, we might
cluster the spam7 dataset from the DAAG package (leaving the yesno column
out of the clustering) and then score each cluster by the proportion of
yes cases it contains. The following R code builds K clusters (K is
specified by the user) and returns a score for each cluster. 
# Some ideas here from Felix Andrews
#' Score k-means clusters by their proportion of positive cases
#'
#' Runs `kmeans()` on `x` and, for every resulting cluster, computes the
#' fraction of its members whose entry in `cases` is `TRUE`.
#'
#' @param x Data passed unchanged to `kmeans()`.
#' @param centers Either the number of clusters or a matrix of initial
#'   cluster centres, exactly as accepted by `kmeans()`.
#' @param cases Logical vector (or one-column logical matrix) with one entry
#'   per row of `x`, flagging the cases of interest.
#' @return The object returned by `kmeans()` with an extra element `scores`:
#'   a numeric vector with one value per cluster, in the same order as the
#'   `size` element.
kmeans.scores <- function(x, centers, cases)
{
  # Each observation needs exactly one flag, otherwise the per-cluster
  # subsetting below would silently produce NA or misaligned counts.
  stopifnot(length(cases) == NROW(x))

  clust <- kmeans(x, centers)

  # Take the cluster count from the fit rather than from `centers`, which
  # may be a matrix of starting centres instead of a single number.
  k <- length(clust$size)

  # Score = number of TRUE cases in the cluster / size of the cluster
  clust$scores <- vapply(seq_len(k), function(i) {
    sum(cases[clust$cluster == i] == TRUE) / clust$size[i]
  }, numeric(1))

  clust
}
 | 
> require(DAAG)
> data(spam7)
> clust <- kmeans.scores(spam7[,1:6], centers=10, spam7["yesno"]=="y")
> clust[c("scores","size")]
$scores
 [1] 0.7037037 0.1970109 0.5995763 0.7656250 0.8043478 1.0000000 0.4911628
 [8] 0.7446809 0.6086957 0.6043956
$size
 [1]  162 2208  472  128   46    5 1075   47  276  182
 | 
Now that we have built some clusters we can generate some rules that
describe the clusters:
#' Describe clusters with decision-tree rules (work in progress)
#'
#' Fits a classification tree that predicts the cluster label from the
#' variables in `x`, and prints the rule path leading to every node of the
#' tree. The scoring of those rules against `cases` is not implemented yet.
#'
#' @param x Data frame (or matrix) of the variables that were clustered.
#' @param cluster Vector of cluster labels, one per row of `x`.
#' @param cases Logical vector (or one-column logical matrix) flagging the
#'   cases of interest, one per row of `x`.
#' @return The fitted `rpart` tree.
hotspots <- function(x, cluster, cases)
{
  if (!requireNamespace("rpart", quietly = TRUE))
    stop("package 'rpart' is required by hotspots()", call. = FALSE)

  # Overall proportion of positive cases; NROW() works for both a plain
  # vector and a one-column matrix. Not used yet (see note below).
  overall <- sum(cases) / NROW(cases)

  x.clusters <- cbind(as.data.frame(x), cluster = cluster)
  tree <- rpart::rpart(cluster ~ ., data = x.clusters, method = "class")
  # tree <- rpart::prune(tree, cp = 0.06)

  # path.rpart() prints the path to each requested node as a side effect
  # and returns the paths as a list.
  nodes <- rownames(tree$frame)
  paths <- rpart::path.rpart(tree, nodes = nodes)

  # TO BE CONTINUED: `overall` and `paths` are intended for the
  # still-unwritten step that scores each rule.
  tree
}
 | 
> h <- hotspots(spam7[,1:6], clust$cluster, spam7["yesno"]=="y")  |