/*
* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License").
* You may not use this file except in compliance with the License.
* A copy of the License is located at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* or in the "license" file accompanying this file. This file is distributed
* on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
* express or implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
package software.amazon.neptune.csv2rdf;
import java.io.UnsupportedEncodingException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import org.eclipse.rdf4j.model.IRI;
import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
import org.eclipse.rdf4j.model.vocabulary.RDF;
import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonProperty;
import lombok.Getter;
import lombok.NonNull;
import lombok.ToString;
import lombok.extern.slf4j.Slf4j;
/**
 * <h1>URI Post Transformation</h1>
 *
 * URI Post Transformations are used to transform RDF resource IRIs into more
 * readable ones.
 * <p>
 * A URI Post Transformation consists of four elements:
 *
 * <pre>
 * {@code
 * uriPostTransformation.<ID>.srcPattern=<URI regex pattern>
 * uriPostTransformation.<ID>.typeUri=<URI>
 * uriPostTransformation.<ID>.propertyUri=<URI>
 * uriPostTransformation.<ID>.dstPattern=<URI pattern>
 * }
 * </pre>
 *
 * A positive integer {@code <ID>} is required to group the elements. The
 * grouping numbers of several transformation configurations do not need to be
 * consecutive. The transformation rules will be executed in ascending order
 * according to the grouping numbers. All four configuration items are required:
 *
 * <ul>
 * <li>{@code srcPattern} is a URI with a single regular expression group, e.g.
 * {@code http://example.org/resource/([0-9]+)}, defining the URI patterns of
 * RDF resources to which the post transformation applies.</li>
 * <li>{@code typeUri} filters out all matched source URIs that do not belong to
 * the specified RDF type.</li>
 * <li>{@code propertyUri} is the RDF predicate pointing to the replacement
 * value.</li>
 * <li>{@code dstPattern} is the new URI, which must contain a
 * {@code {{VALUE}}} substring which is then substituted with the value of
 * {@code propertyUri}.</li>
 * </ul>
 *
 * Example:
 *
 * <pre>
 * uriPostTransformation.1.srcPattern=http://example.org/resource/([0-9]+)
 * uriPostTransformation.1.typeUri=http://example.org/class/Country
 * uriPostTransformation.1.propertyUri=http://example.org/datatypeProperty/code
 * uriPostTransformation.1.dstPattern=http://example.org/resource/{{VALUE}}
 * </pre>
 *
 * This configuration transforms the URI {@code http://example.org/resource/123}
 * into {@code http://example.org/resource/FR}, given that there are the
 * statements:
 *
 * <pre>
 * {@code
 * <http://example.org/resource/123> a <http://example.org/class/Country> .
 * <http://example.org/resource/123> <http://example.org/datatypeProperty/code> "FR" .
 * }
 * </pre>
 *
 * Note that we assume that the property {@code propertyUri} is unique for each
 * resource; if two different values are registered for the same resource, a
 * runtime exception will be thrown (repeating the same value is tolerated).
 * Also note that the post transformation is applied using a two-pass algorithm
 * over the generated data, and the translation mapping is kept fully in memory.
 * This means the property is suitable only in cases where the number of
 * mappings is small or if the amount of main memory is large.
 */
@Slf4j
@ToString(includeFieldNames = true)
public class UriPostTransformation {

    /** Compiled regular expression a resource URI must fully match to be eligible. */
    @Getter
    private final Pattern srcPattern;

    /** RDF type a resource must be declared with to be eligible. */
    @Getter
    private final String typeUri;

    /** Predicate whose object supplies the replacement value. */
    @Getter
    private final String propertyUri;

    /** Template of the new URI; contains the replacement variable. */
    @Getter
    private final String dstPattern;

    /** Subjects that matched both {@link #srcPattern} and {@link #typeUri}. */
    @ToString.Exclude
    private final Set<String> resources = new HashSet<>();

    /** Replacement value registered per subject via {@link #propertyUri}. */
    @ToString.Exclude
    private final Map<String, String> resource2Value = new HashMap<>();

    @ToString.Exclude
    private final SimpleValueFactory vf = SimpleValueFactory.getInstance();

    /**
     * Create a URI post transformation rule.
     *
     * @param srcPattern  URI with a single regular expression group, e.g.
     *                    {@code http://example.org/resource/([0-9]+)}, defining
     *                    the URI patterns of RDF resources to which the post
     *                    transformation applies.
     * @param typeUri     RDF type URI to filter out all matched source URIs that
     *                    do not belong to the specified RDF type.
     * @param propertyUri An RDF predicate pointing to the replacement value.
     * @param dstPattern  is the new URI, which must contain a {@code {{VALUE}}}
     *                    substring which is then substituted with the value of
     *                    {@code propertyUri}.
     * @throws Csv2RdfException if the regex of srcPattern is invalid or
     *                          dstPattern does not contain the replacement
     *                          variable
     */
    @JsonCreator
    public UriPostTransformation(@JsonProperty(value = "srcPattern", required = true) @NonNull String srcPattern,
            @JsonProperty(value = "typeUri", required = true) @NonNull String typeUri,
            @JsonProperty(value = "propertyUri", required = true) @NonNull String propertyUri,
            @JsonProperty(value = "dstPattern", required = true) @NonNull String dstPattern) {

        try {
            this.srcPattern = Pattern.compile(srcPattern);
        } catch (PatternSyntaxException e) {
            throw new Csv2RdfException("Regex is bad. " + e.getMessage() + ".", e);
        }
        this.typeUri = typeUri;
        this.propertyUri = propertyUri;

        // Without the placeholder there would be nowhere to insert the replacement value.
        if (!dstPattern.contains(PropertyGraph2RdfConverter.REPLACEMENT_VARIABLE)) {
            throw new Csv2RdfException(
                    "The pattern <" + dstPattern + "> for the new URI must contain the replacement variable "
                            + PropertyGraph2RdfConverter.REPLACEMENT_VARIABLE + ".");
        }
        this.dstPattern = dstPattern;
    }

    /**
     * Phase 1: register all URIs; this is a no-op if the parameters do not match
     * as described.
     *
     * @param subject   resource to be possibly transformed, must match
     *                  {@link UriPostTransformation#srcPattern}
     * @param predicate must match {@link RDF#TYPE}
     * @param object    must match the {@link UriPostTransformation#typeUri}
     */
    public void registerResource(String subject, String predicate, String object) {
        // Known-non-null values are the equals() receivers, so a null predicate or
        // object is simply treated as "does not match", as in registerReplacementValue.
        if (!RDF.TYPE.toString().equals(predicate) || !typeUri.equals(object)) {
            return;
        }
        // matches() requires the whole subject to match the pattern, not just a part.
        if (srcPattern.matcher(subject).matches()) {
            resources.add(subject); // adding the same subject again is harmless
        }
    }

    /**
     * Phase 2: register the replacement values; this is a no-op if the predicate
     * does not match as described.
     *
     * @param subject   resource to be possibly transformed
     * @param predicate must match {@link UriPostTransformation#propertyUri}
     * @param object    possible replacement value
     * @throws Csv2RdfException if a different replacement value was already
     *                          registered for the subject
     */
    public void registerReplacementValue(String subject, String predicate, String object) {
        if (!propertyUri.equals(predicate)) {
            return;
        }
        String previous = resource2Value.get(subject);
        // Re-registering the identical value is fine; only conflicting values are an error.
        if (previous != null && !previous.equals(object)) {
            throw new Csv2RdfException("Found duplicate, inconsistent value for <" + subject + ">: " + object
                    + " vs. " + previous);
        }
        resource2Value.put(subject, object);
    }

    /**
     * Phase 3: apply transformation, return {@code null} if no transformation has
     * been applied.
     * <p>
     * The replacement value will be URI encoded before it is substituted for the
     * replacement variable in {@link UriPostTransformation#dstPattern}.
     *
     * @param uri URI of the resource that is going to be replaced
     * @return the new {@link IRI} or {@code null} if the resource was not
     *         registered as matching {@link UriPostTransformation#srcPattern} and
     *         {@link UriPostTransformation#typeUri}, or no replacement value was
     *         found
     * @throws Csv2RdfException if the value cannot be encoded or the resulting
     *                          string is not a valid IRI
     */
    public IRI apply(String uri) {
        if (!resources.contains(uri)) {
            return null; // type did not match, no op
        }
        String value = resource2Value.get(uri);
        if (value == null) {
            log.info("---> No replacement value found for <{}>. Resource was not transformed.", uri);
            return null;
        }
        String resource = dstPattern.replace(PropertyGraph2RdfConverter.REPLACEMENT_VARIABLE, encode(value));
        return toValidatedIri(resource);
    }

    /**
     * URI encode a value using the UTF-8 encoding scheme.
     * <p>
     * Note: {@link URLEncoder} produces {@code application/x-www-form-urlencoded}
     * output, so a space character is encoded as {@code +}.
     *
     * @param value the value to encode
     * @return URI encoded value
     * @throws Csv2RdfException if the value could not be encoded
     */
    private String encode(String value) {
        try {
            return URLEncoder.encode(value, StandardCharsets.UTF_8.name());
        } catch (UnsupportedEncodingException e) {
            throw new Csv2RdfException("Could not encode '" + value + "' when applying " + toString() + ".", e);
        }
    }

    /**
     * Convert a string into an IRI, validating it by parsing it as a
     * {@link URI} first.
     *
     * @param iri the string to convert
     * @return new {@link IRI}
     * @throws Csv2RdfException if the IRI cannot be created
     */
    private IRI toValidatedIri(String iri) {
        try {
            return vf.createIRI(new URI(iri).toString());
        } catch (URISyntaxException | IllegalArgumentException e) {
            throw new Csv2RdfException("Invalid resource URI <" + iri + "> generated when applying " + toString() + ".",
                    e);
        }
    }

    /**
     * Hash over the four configuration values. The regex source text is hashed
     * instead of the {@link Pattern} object, because {@link Pattern} does not
     * override {@code hashCode} and {@link #equals(Object)} compares the source
     * text; hashing the object itself would give equal instances different hashes.
     */
    @Override
    public int hashCode() {
        return Objects.hash(srcPattern.pattern(), typeUri, propertyUri, dstPattern);
    }

    /**
     * Two transformations are equal if they have the same regex source text, type
     * URI, property URI and destination pattern. Registered resources and
     * replacement values are not part of the comparison.
     */
    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (obj == null || getClass() != obj.getClass()) {
            return false;
        }
        UriPostTransformation other = (UriPostTransformation) obj;
        // srcPattern is never null: the constructor either assigns it or throws.
        return srcPattern.pattern().equals(other.srcPattern.pattern())
                && Objects.equals(propertyUri, other.propertyUri)
                && Objects.equals(typeUri, other.typeUri)
                && Objects.equals(dstPattern, other.dstPattern);
    }
}