WITH x(v) AS (VALUES (1),(2),(NULL))
SELECT l.v, r.v, l.v = r.v AS equality,
l.v IS NOT DISTINCT FROM r.v AS isnotdistinctfrom
FROM x l, x r;
Thanks to Sam Mason via the pgsql-general mailing list.
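For the record, the cross join returns nine rows; plain equality yields NULL whenever either operand is NULL, while IS NOT DISTINCT FROM treats two NULLs as equal (row order isn't guaranteed without an ORDER BY):
 v | v | equality | isnotdistinctfrom
---+---+----------+-------------------
 1 | 1 | t        | t
 1 | 2 | f        | f
 1 |   |          | f
 2 | 1 | f        | f
 2 | 2 | t        | t
 2 |   |          | f
   | 1 |          | f
   | 2 |          | f
   |   |          | t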
## box-and-whisker plot of FFT power spectrum by frequency/period
## vary these parameters -- r as per logistic map
## see http://en.wikipedia.org/wiki/Logistic_map for details
logmap = function(n, r, x0=0.5) {
ret = rep(x0,n)
for (ii in 2:n) {
ret[ii] = r*ret[ii-1]*(1-ret[ii-1])
}
return(ret)
}
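## quick sanity check (an added aside): the map should settle to a fixed
## point at r=2.5, a 2-cycle at r=3.2, and wander chaotically at r=3.9
matplot(sapply(c(2.5, 3.2, 3.9), function(r) logmap(100, r)),
        type='l', lty=1, xlab='iteration', ylab='x')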
## modify these for interesting behavior differences
rfrom = 3.4
rto = 3.7
rsteps=200
seqlen=1e4
require(plyr)
require(ggplot2)
mytimeseries = aaply(seq(from=rfrom, to=rto, length.out=rsteps), 1,
function(x) {
logmap(seqlen, x)
})
## you can plug in any array here for mytimeseries
## each row is a timeseries
## for series by column, change the margin from 1 to 2, below
logspec = adply( mytimeseries, 1, function(x) {
## change spec.pgram parameters as per goals
ret = spectrum(x, taper=0, demean=T, pad=0, fast=T, plot=F)
return( data.frame(freq=ret$freq, spec=ret$spec))
})
## boxplot stats at each unique frequency
logspec.bw = ddply(logspec, 'freq', function(x) {
ret = boxplot.stats(log10(x$spec));
## sometimes $out is empty, bind NA to keep dimensions correct
return(data.frame(t(10^ret$stats), out=c(NA,10^ret$out)))
})
## plot -- outliers are dots
## be careful with spacing of hard returns -- ggplot has odd rules
## as close to default "spectrum" plot as possible
ggplot(logspec.bw, aes(x=1/freq)) +
    geom_ribbon(aes(ymin=X1, ymax=X5), alpha=0.35, fill='green') +
    geom_ribbon(aes(ymin=X2, ymax=X4), alpha=0.5, fill='blue') +
    geom_line(aes(y=X3)) +
    geom_point(aes(y=out), color='darkred') +
    scale_x_continuous(trans='inverse', name='Period') +
    scale_y_continuous(name='Power', trans='log10')
require(animation)
saveVideo({
ani.options(interval = 1/5, ani.type='png', ani.width=1600, ani.height=1200)
for (ii in c(seq(from=3.2, to=3.5, length.out=200), seq(from=3.5, to=4, length.out=1200))) {
spectrum(logmap(1e4, ii), taper=0, demean=T, pad=0, fast=T, sub=round(ii, 3))
}
}, video.name = "logmap2.mp4", other.opts = "-b 600k")
## using R --
## logistic map function:
logmap = function(n, r, x0=0.5) {
ret = rep(x0,n)
for (ii in 2:n) {
ret[ii] = r*ret[ii-1]*(1-ret[ii-1])
}
return(ret)
}
## compare:
spectrum(logmap(1e5, 3.5), main='R tapers to 0.1 by default')
spectrum(logmap(1e5, 3.5), taper=0, main='No taper')
spectrum(logmap(1e5+1, 3.5), taper=0, main='Non-composite length, fast=T pads by default')
spectrum(logmap(1e5, 3.5), taper=0, pad=1, sub='Lots of padding approximately recovers "correct" results')
## use aaply -- the simplest code
require(plyr)
testfun0 <- function(x, y) aaply(x, 1, function(x) rbinom(1, x, y))
## rewrite the above as an explicit loop
testfun1 = function(nrep, x, y) {
## loop index
ii<-0;
## result vector
ret<-rep(0, nrep);
while(ii < nrep) {
ii<-ii+1;
## how many successes for each element of bb?
ret[ii] <- rbinom(1, x[ii], y)
}
return(ret)
}
## define source code as string, pass to inline
src <- '
// Cast input parameters to correct types
// clone prevents direct modification of the input object
IntegerVector tmp(clone(x));
double rate = as< double >(y);
// IntegerVector inherits from STL vector, so we get standard STL methods
int tmpsize = tmp.size();
// initialize RNG from R set.seed() call
RNGScope scope;
// loop
for (int ii =0; ii < tmpsize; ii++) {
// Rf_rbinom is in the R api
// For more details, See Rcpp-quickref.pdf, "Random Functions"
// also, Rcpp provides "safe" accessors via, e.g., tmp(ii)
tmp(ii) = Rf_rbinom(tmp(ii), rate);
};
// tmp points to a native R object, so we can return it as-is
// if we wanted to return, e.g., the double rate, use:
// return wrap(rate)
return tmp;
'
require(inline)
## compile the function, inspect the process with verbose=T
testfun2 = cxxfunction(signature(x='integer', y='numeric'), src, plugin='Rcpp', verbose=T)
## timing function
ps <- function(x) print(system.time(x))
## Input vector
bb <- rbinom(1e5, 20, 0.5)
## test each case for 2 different loop lengths
for( nrep in c(1e4, 1e5)){
## initialize RNG each time with the same seed
## plyr
set.seed(20); ps(cc0 <- testfun0(bb[1:nrep], 0.1))
## explicit while loop
set.seed(20); ps(cc1<- testfun1(nrep, bb, 0.1))
## Rcpp
set.seed(20); ps(cc2<- testfun2(bb[1:nrep], 0.1))
print(all.equal(cc1, cc2))
}
## output
user system elapsed
3.392 0.020 3.417
user system elapsed
0.116 0.000 0.119
user system elapsed
0.000 0.000 0.002
[1] TRUE
user system elapsed
37.534 0.064 37.601
user system elapsed
1.228 0.000 1.230
user system elapsed
0.020 0.000 0.021
[1] TRUE
## Using Rcpp, inline and STL algorithms and containers
## use the std::binary_function templated struct
inc <- '
struct Rb : std::binary_function< double, double, double > {
double operator() (double size, double prob) const { return ::Rf_rbinom(size, prob); }
};
'
## define source code as a character vector, pass to inline
src <- '
NumericVector sz(x);
RNGScope scope;
return NumericVector::import_transform(sz.begin(), sz.end(), std::bind2nd(Rb(), as< double >(y)));
'
## compile the function, inspect the process with verbose=TRUE
f5 <- cxxfunction(signature(x='numeric', y='numeric'), src, 'Rcpp', inc, verbose=TRUE)
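Timing f5 against the scalar-loop Rcpp version is straightforward (a quick sketch, assuming bb and ps() from the timing section above are still in scope; both versions make the same RNG draws, so the results should match):
set.seed(20); ps(cc2 <- testfun2(bb, 0.1))
set.seed(20); ps(cc5 <- f5(bb, 0.1))
print(all.equal(as.numeric(cc2), cc5))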
myhist = function(x, dig=3) {
x = trunc(x * 10^dig) / 10^dig;  ## truncate to dig digits
## x=round(x, digits=dig);
aa = bb = seq(0,1,1/10^dig);
for (ii in 1:length(aa)) {
aa[ii] = sum(x==aa[ii])
};
return(cbind(bin=bb, dens=aa/length(x)))
}
## random variates
test = sort(runif(1e4))
get1 = myhist(test)
## their method
require(ash)
get2 = bin1(test, c(0,1), 1e3+1)$nc
require(plyr); require(reshape2)
mymatrix = matrix(1:10, nrow=10)
apply(mymatrix, 1, sum)
apply(mymatrix, 2, sum)
rowSums(mymatrix)
colSums(mymatrix)
myarray = array(0, dim=c(3,4,5,6,10), dimnames=list(a=1:3, b=1:4, c=1:5, d=letters[1:6], e=LETTERS[1:10]))
melt(adply(myarray, 1:4, I))
An unexpected bonus: this way of thinking lends itself nicely to explicit parallelism. If you can describe the problem as a single recipe that's applied to each "piece" of a whole, then you're one step away from using all the cores on your machine to solve it with foreach. Need to apply some long-running analysis to each element of a list, and want the results back as a list? Why write a for loop when you can do:
## prep
## warning -- this may not be fast
## use smaller matrices for smaller machines!
nn = 2^20
rr = 2^10
mylist = list(a=matrix(rnorm(nn), nrow=rr), b=matrix(rnorm(nn), nrow=rr), c=matrix(rnorm(nn), nrow=rr))
## analysis
require(foreach); require(iterators)
myeigs = foreach(ii = iter(mylist), .combine=c) %do% { print('ping'); return(list(eigen(ii))) }
require(multicore)
require(doMC)
## use the appropriate # of cores, of course
registerDoMC(cores=4)
myeigs1 = foreach( ii = iter(mylist), .combine=c) %dopar% { print('ping'); return(list(eigen(ii)))}
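As a sanity check, the serial and parallel runs should return identical results:
print(all.equal(myeigs, myeigs1))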
Error: cannot allocate vector of size 818.6 Mb
Enter a frame number, or 0 to exit
...
4: mk.bootrep(zz, 75)
5: mk.boot.R#165: aaply(tmpreps, .margins = 1, function(x) {
6: laply(.data = pieces, .fun = .fun, ..., .progress = .progress, .drop = .dro
7: list_to_array(res, attr(.data, "split_labels"), .drop)
8: unname(unlist(res))
dimcurse = function(dims=1:1e3, .sd=c(1e-2, 0.1, .2, .5, 1), reps=1e3, cores=4,
rfun=rnorm, .methods=c('manhattan', 'euclidean', 'maximum'), .verbose=F){
## examine distance between 2 points in dims dimensions for different metrics
## chosen at random according to rfun (rnorm by default)
##
require(foreach)
require(doMC)
registerDoMC(cores=cores)
## use multicore/foreach on the outside (each sd)
finret = foreach(ss=iter(.sd), .combine=rbind, .verbose=.verbose) %dopar% {
## pre-allocate, don't append/rbind!
ret = data.frame(.sd=1:(reps*length(dims)*length(.methods)), .dim=1, .rep=1, .dist=1, method=factor(NA, levels=.methods))
ii = 1
for (.dim in dims) {
for (method in .methods) {
## reset .rep counter
.rep = 1
while (.rep <= reps) {
## 2 points, one per row, randomly chosen
tmp = matrix(rfun(2*.dim, sd=ss), nrow=2)
## calc dist
mydist = as.numeric(dist(tmp, method=method))
## fill return dataframe
ret[ii, ] = list(ss, .dim, .rep, mydist, method)
## update rep counter
.rep = .rep +1
## update ret row counter
ii = ii +1
}
}
}
return(ret)
} ## rbind happens here
return(finret)
}
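The plots below read from a dataframe aa; an illustrative call (parameters trimmed from the defaults for a quicker run) might be:
aa = dimcurse(dims=1:200, reps=100)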
For plotting, I've used:
require(lattice)
xyplot(.dist ~ .dim | method, groups=.sd, data=aa, pch='.',
par.settings=list(background=list(col='darkgrey')), layout=c(3,1),
scales=list(y=list(relation='free'), alternating=1),
auto.key=list(space='bottom', columns=5, points=F, lines=T),
xlab='Dimension', ylab='Distance', main='Distance between 2 N-dimensional points \n drawn from a normal distribution with varying sd (legend) using different metrics (columns)',
)
xyplot(.dist ~ .dim | as.character(.sd), groups=method, data=aa, pch='.',
par.settings=list(background=list(col='darkgrey')), layout=c(5,1),
scales=list(y=list(relation='free'), alternating=1),
auto.key=list(space='bottom', columns=3, points=F, lines=T),
xlab='Dimension', ylab='Distance', main='Distance between 2 N-dimensional points \n drawn from a normal distribution with varying SD (columns) using different metrics (legend)',
)
$ ssh -f -R 10187:localhost:10187 remotehost sleep 600 ## 10 minutes to set up connection
> require(snow); require(foreach); require(doSNOW);
> cl = makeCluster(rep('localhost',3), type='SOCK', manual=T) ## wait for 3 slaves
## Use ssh to start the slaves on remotehost and connect to the (local) forwarded port
$ for i in {1..3}; do
>   ssh -f remotehost "/usr/lib64/R/bin/Rscript ~/R/x86_64-pc-linux-gnu-library/2.12/snow/RSOCKnode.R \
>     MASTER=localhost PORT=10187 OUT=/dev/null SNOWLIB=~/R/x86_64-pc-linux-gnu-library/2.12"
> done
## R should have picked up the slaves and returned. If not, something is wrong.
## Back up and test ssh connectivity, etc.
> registerDoSNOW(cl)
> a <- matrix(1:4e6, nrow=4)
> b <- t(a)
> bb <- foreach(b=iter(b, by='col'), .combine=cbind) %dopar%
+     (a %*% b)
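When the computation is done, shut the slaves down cleanly:
> stopCluster(cl)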
create schema transtest;
set search_path to transtest,public;
-- it's nice to have a sandbox
-- remember to reset search_path when done
create table myents ( id serial primary key, sum integer);
-- the entries table
create table myrecs ( id serial primary key, entid integer references myents (id), cases integer);
-- the reports table, many reports per entry
create table myids ( id integer primary key references myents (id));
-- table used to store trigger state
-- function for when-clause (condition) in trigger
-- add id to the "to-do" list if it's not there already
-- return True only if it's been added to the list
create or replace function f_condition(myid integer) returns boolean as $_$
DECLARE
myret boolean := True;
newid integer;
BEGIN
RAISE NOTICE 'Return Value here is %', myret;
select count(*) into newid from myids where id = myid;
if newid > 0 then -- id is there, return False
myret := False;
else -- add id to queue, return True
insert into myids (id) values (myid);
end if;
return myret;
END;
$_$ language plpgsql;
-- trigger function to rebuild myent.sum for this myrecs.id
create or replace function f_refresh() returns trigger as $_$
DECLARE
mysum integer;
myid integer;
BEGIN
IF (TG_OP = 'DELETE') THEN
myid := OLD.entid;
ELSE
myid := NEW.entid;
END IF;
RAISE NOTICE 'Return Value here is %', myid;
select sum(cases) into mysum from myrecs where entid = myid;
-- compute entry sum from records and update it.
-- PL/pgSQL rules on queries are quirky. Read the docs.
update myents set sum = mysum where id = myid;
delete from myids where id = myid;
-- clean up
RETURN NULL;
END;
$_$ language plpgsql;
-- only fire trigger once per myent.id using when clause (condition)
-- eval is deferred until transaction concludes
create constraint trigger t_cache after insert or update
on myrecs initially deferred for each row
when ( f_condition(NEW.entid) ) -- comment out for pre pg 9.0
execute procedure f_refresh();
-- need 2 triggers: one each for NEW (insert and update) and OLD (delete)
create constraint trigger t_cache_del after delete
on myrecs initially deferred for each row
when ( f_condition(OLD.entid) )
execute procedure f_refresh();
insert into myents (sum) VALUES ( 0);
begin;
insert into myrecs (entid, cases) VALUES ( 1, 0), (1, 1), (1,2);
select * from myents; -- 0
select * from myids; -- entid from above
commit;
select * from myents; -- new sum
select * from myids; -- empty
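The update and delete paths can be exercised the same way (a quick sketch in the same vein, values chosen arbitrarily):
begin;
update myrecs set cases = 10 where id = 1;
delete from myrecs where id = 2;
commit;
select * from myents; -- sum recomputed to reflect both changes
select * from myids; -- empty again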