Removing Duplicates from data

A simple script to remove duplicate rows from a data set. There are two versions; the second is more efficient.

R-implementation

# Remove duplicate rows from a data set, keeping the first occurrence
# of each distinct row, and write the de-duplicated rows to a file.
#
# Args:
#   the.data: a data frame (or matrix) of observations, one per row.
#   out.file: path of the output file; written with write.table,
#             without row or column names.
#   num.data: number of rows to consider (defaults to all rows).
#   num.var:  number of columns to compare (defaults to all columns).
#
# Returns (invisibly): the indices of the rows that were kept.
remove.duplicates <- function(the.data, out.file = "newfile.txt",
                              num.data = nrow(the.data),
                              num.var = ncol(the.data)) {
  # Restrict to the requested rows/columns; drop = FALSE keeps the
  # result two-dimensional even when num.var == 1.
  use.data <- the.data[seq_len(num.data), seq_len(num.var), drop = FALSE]
  # duplicated() flags every occurrence after the first. This replaces
  # the original O(n^2) pairwise comparison, which also clobbered the
  # last column with a -1 sentinel -- corrupting later comparisons and
  # silently removing BOTH copies when the last column was already -1.
  to.keep <- which(!duplicated(use.data))
  write.table(use.data[to.keep, , drop = FALSE], out.file,
              row.names = FALSE, col.names = FALSE)
  # cat() instead of print(): print.default has no row.names/col.names
  # arguments, and the original message string contained a stray
  # embedded newline from a source line break.
  cat(num.data - length(to.keep), "data have been removed, leaving",
      length(to.keep), "\n")
  invisible(to.keep)
}
# Remove duplicate rows, keeping the first occurrence of each distinct
# row in its original position, and write the result to a file.
#
# Same contract as remove.duplicates(); this is the "efficient"
# variant. The original sort-and-scan implementation had three bugs:
#   * order(clean.set[1]) sorted on the first column only, so identical
#     rows tied with other rows on that column were not necessarily
#     adjacent and could escape detection;
#   * on a matrix input, clean.set[1] extracts a single scalar, and the
#     subsequent subscript collapsed the whole data set to one row;
#   * 2:num.data is c(2, 1) when num.data == 1, indexing row 0.
# duplicated() provides the same keep-first, original-order result in
# linear time without any of these failure modes.
#
# Args:
#   the.data: a data frame (or matrix) of observations, one per row.
#   out.file: path of the output file; written with write.table,
#             without row or column names.
#   num.data: number of rows to consider (defaults to all rows).
#   num.var:  number of columns to compare (defaults to all columns).
#
# Returns (invisibly): the indices of the rows that were kept.
remove.duplicates.2 <- function(the.data, out.file = "newfile.txt",
                                num.data = nrow(the.data),
                                num.var = ncol(the.data)) {
  use.data <- the.data[seq_len(num.data), seq_len(num.var), drop = FALSE]
  # Keep-first semantics match the original's stable sort + index
  # column, which existed only to restore the original row order.
  to.keep <- which(!duplicated(use.data))
  write.table(use.data[to.keep, , drop = FALSE], out.file,
              row.names = FALSE, col.names = FALSE)
  cat(num.data - length(to.keep), "data have been removed, leaving",
      length(to.keep), "\n")
  invisible(to.keep)
}
Advertisements

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out /  Change )

Google+ photo

You are commenting using your Google+ account. Log Out /  Change )

Twitter picture

You are commenting using your Twitter account. Log Out /  Change )

Facebook photo

You are commenting using your Facebook account. Log Out /  Change )

Connecting to %s