R code: clustering

# K-means clustering
# first example: made-up data
# Draw 100 numbers, normally distributed with mean 0 and sd 0.3,
# and organize them in 2 columns so we have 50 points with 2 coordinates each.
cl1 = matrix(rnorm(100, sd = 0.3), ncol = 2)
# Draw 100 numbers with mean 1.5 and the same sd,
# again organized in 2 columns for 50 datapoints with 2 coordinates each
cl2 = matrix(rnorm(100, mean=1.5, sd = 0.3), ncol = 2)

# plot both point clouds in the same panel
# (xlim and ylim are set so that both clouds fit)
plot(cl1, xlim = range(cl1, cl2), ylim = range(cl1, cl2))
points(cl2)

# combine the two sets of points, and let's see
# if the clustering method can separate them again
data = rbind(cl1, cl2)

# doing the clustering
clustering = kmeans(data, 2)
# inspecting the result
clustering
# The printed result looks roughly like the block below.
# "Cluster means" are the cluster centers,
# one center per line.
# "Clustering vector" assigns each datapoint in "data"
# a cluster, 1 or 2.
# "Within cluster sum of squares by cluster" is the sum of squared distances
# of the data points from their cluster mean, by cluster.
##
## K-means clustering with 2 clusters of sizes 50, 50
##
## Cluster means:
##         [,1]       [,2]
## 1 0.01778026 0.01465019
## 2 1.47658108 1.46337011

## Clustering vector:
##   [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [45] 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
##  [89] 2 2 2 2 2 2 2 2 2 2 2 2

## Within cluster sum of squares by cluster:
## [1] 9.814072 9.629475
##  (between_SS / total_SS =  84.5 %)

## Available components:

## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss" "betweenss"  
## [7] "size"         "iter"         "ifault"     


# How good is the clustering?
# withinss: per cluster, the sum of squared distances between each
# datapoint and its cluster mean
# betweenss: the sum of squared distances between the cluster means
# and the overall data mean, weighted by cluster size
clustering$withinss
clustering$betweenss
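
# A quick quality summary (a sketch, not part of the original script):
# the share of total variance captured by the clustering, the same
# ratio that kmeans prints as "between_SS / total_SS" above.
clustering$betweenss / clustering$totss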

# visualizing the results of clustering
plot(data, col = clustering$cluster, pch=clustering$cluster)
# adding the cluster centers
points(clustering$centers, col = 1:2, pch = 8, cex = 2)

# For a more challenging dataset, set the second mean to 1.0 so the
# clusters overlap, or ask for 3-4 clusters.
# In that case, use nstart = 25 so that kmeans tries 25 random starting
# configurations and keeps the best solution.
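
# For example, a sketch of the harder setup (cl2.hard, data.hard and
# clustering.hard are new names introduced here just for illustration):
cl2.hard = matrix(rnorm(100, mean = 1.0, sd = 0.3), ncol = 2)
data.hard = rbind(cl1, cl2.hard)
clustering.hard = kmeans(data.hard, 3, nstart = 25)
plot(data.hard, col = clustering.hard$cluster, pch = clustering.hard$cluster)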

#######
# K-means clustering, real dataset
# NYT opinion articles
nyt = read.table("nyt_opinion.txt", header=T)

# columns 13 to 29 of the table hold the content-word counts
content.words = nyt[,13:29]

# do the clustering of articles by content words.
# 4 clusters, 25 attempts with different random starting points.
nyt.cl = kmeans(content.words, 4, nstart = 25)

# inspecting the clusters
nyt$cl = nyt.cl$cluster
# How do the authors distribute over the clusters?
xtabs(~nyt$cl + nyt$Author)
# what do the content word counts look like by cluster?
aggregate(content.words, list(nyt$cl), sum)
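
# The choice of 4 clusters above is a guess. A common heuristic for
# sanity-checking it (a sketch, not part of the original script) is an
# "elbow" plot: total within-cluster sum of squares for k = 1..10;
# keep the k where the curve stops dropping steeply.
wss = sapply(1:10, function(k) kmeans(content.words, k, nstart = 25)$tot.withinss)
plot(1:10, wss, type = "b", xlab = "number of clusters k",
     ylab = "total within-cluster SS")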

#############
# Hierarchical clustering
# clustering languages by similarity of morphosyntactic characteristics

library(languageR)

# make an object with pairwise distances of languages
dist.languages = dist(phylogeny[3:ncol(phylogeny)], method="binary")
# do the clustering, plot
# Suggestion: try out methods "ward.D", "ward.D2", "single", "complete",
# "average" and see how the clustering changes (a comparison sketch
# follows after the plot below).
# Note: since R 3.1.0 the old method name "ward" is called "ward.D".
clustering = hclust(dist.languages, method="ward.D")
plot(clustering)
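
# A compact way to try that suggestion (a sketch; the 2x2 layout is
# just one convenient choice):
op = par(mfrow = c(2, 2))
for (m in c("ward.D", "single", "complete", "average")) {
  plot(hclust(dist.languages, method = m), main = m, labels = FALSE)
}
par(op)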

# make it so we can see which languages got clustered
plotnames = as.character(phylogeny$Language)
plot(clustering, labels=plotnames, cex=0.8)
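
# To turn the dendrogram into a flat clustering, cut the tree into a
# chosen number of groups with cutree() (an extra step, not in the
# original script; 4 groups is an arbitrary illustrative choice):
groups = cutree(clustering, k = 4)
# which languages ended up in which group
split(plotnames, groups)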

###############
# Dimensionality reduction:
# Code from the Baayen book

library(languageR)
# affixProductivity dataset: stems can be combined with different
# affixes, for example warm+th, clever+ness
#
# Question: is affix productivity related to stylistic factors?
# The affixProductivity dataset lists, for 44 texts by various
# authors and genres, a productivity index for each of 27 affixes.
# Genres: religious texts, children's books, literary texts, and others.

# Here: use PCA to explore what dimensions give relevant information
# about the data. All columns except the last three have affixes. Last three
# columns encode info about author, genre, and birth year of author.

# doing the principal component analysis (PCA)
# run with scale. = TRUE, as PCA is sensitive to
# different values being on different scales
# (prcomp spells this argument "scale." with a trailing dot)
affixes.pr = prcomp(affixProductivity[, 1:(ncol(affixProductivity)-3)], scale. = TRUE)

# what do we have in the resulting object?
names(affixes.pr)
# Result:
# [1] "sdev"     "rotation" "center"   "scale"    "x"

# summary(affixes.pr) prints the standard deviation of each
# principal component, and the proportion of variance it explains:
# the squared sdev divided by the sum of all squared sdevs.
# The standard deviations are also directly accessible as
# affixes.pr$sdev

summary(affixes.pr)


# Proportion of variance explained by each principal component
# (the summary also shows the cumulative version):
affixes.pr$sdev^2 / sum(affixes.pr$sdev^2)
# cumulative proportion for components 1-k
cumsum(affixes.pr$sdev^2) / sum(affixes.pr$sdev^2)

# Rules of thumb:
# * keep only principal components that account for at
#    least 5% of the variance.
# * visualize the proportion of variance explained by each principal
#   component; only keep components up to the first major drop in the graph

# visualization:
plot(affixes.pr)
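
# An alternative view (a sketch): screeplot() with type = "lines"
# draws the same variances as a curve, which makes the first major
# drop easier to spot.
screeplot(affixes.pr, type = "lines")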

# The original dataset affixProductivity has one book per line.
# Each of these books has coordinates on the new principal components.
# Here are the coordinates on the first three principal components
# for some books
affixes.pr$x[c("Mormon", "Austen", "Carroll", "Gao"), 1:3]
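
# Where do these coordinates come from? The scores in $x are the
# centered and scaled data multiplied by the rotation matrix; this
# sketch reproduces the first rows (assuming scale. = TRUE as above):
head(scale(affixProductivity[, 1:(ncol(affixProductivity)-3)]) %*% affixes.pr$rotation)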

# How can we interpret principal components?
# One way is to look at the "loadings" of the affixes on the
# principal components: flip the matrix around and view these loadings
# as coordinates of the affixes in the new dimensions.
# Interpret a principal component by checking which affixes
# have a large positive or negative value on it.
# Loadings are available in affixes.pr$rotation

affixes.pr$rotation[1:10, 1:3]
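
# For instance (a sketch), the five affixes with the largest absolute
# loadings on the first principal component:
sort(abs(affixes.pr$rotation[, 1]), decreasing = TRUE)[1:5]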

####
# PCA as soft clustering
# Texts that have high values on the same principal dimensions, or
# affixes that have high loadings on the same principal dimensions,
# can be viewed as loosely clustered together. Each principal
# component can then be viewed as a soft cluster.
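
# For example (a sketch), the five texts with the highest scores on
# the first principal component:
sort(affixes.pr$x[, 1], decreasing = TRUE)[1:5]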

####
# Visualization
# Visualize the clustering of texts and the clustering of affixes in
# the same plot with a biplot: the first colour/cex entry applies to
# the texts (scores), the second to the affixes (loadings).

biplot(affixes.pr, scale = 0, var.axes = FALSE,
       col = c("darkgrey", "black"), cex = c(0.9, 1.2))

