From 916bfd53197f07b60b5bcb205967c81b9f4cb280 Mon Sep 17 00:00:00 2001 From: Alexandre Bourret Date: Wed, 17 May 2023 17:59:57 +0200 Subject: [PATCH 1/4] Adding custom dataset to retrieve documents metadata --- .../connector.json | 65 +++++++++++++++++++ .../connector.py | 48 ++++++++++++++ python-lib/sharepoint_client.py | 21 ++++++ 3 files changed, 134 insertions(+) create mode 100644 python-connectors/sharepoint-online_documents-metadata/connector.json create mode 100644 python-connectors/sharepoint-online_documents-metadata/connector.py diff --git a/python-connectors/sharepoint-online_documents-metadata/connector.json b/python-connectors/sharepoint-online_documents-metadata/connector.json new file mode 100644 index 0000000..8d5d380 --- /dev/null +++ b/python-connectors/sharepoint-online_documents-metadata/connector.json @@ -0,0 +1,65 @@ +{ + "meta" : { + "label": "Documents' metadata", + "description": "Retrieve metadata for all documents stored on your SharePoint server", + "icon": "icon-cloud" + }, + "readable": true, + "writable": false, + "params": [ + { + "name": "auth_type", + "label": "Type of authentication", + "type": "SELECT", + "selectChoices": [ + { + "value": "login", + "label": "User name / password" + }, + { + "value": "oauth", + "label": "Azure Single Sign On" + }, + { + "value": "site-app-permissions", + "label": "Site App Permissions" + } + ] + }, + { + "name": "sharepoint_oauth", + "label": "Azure preset", + "type": "PRESET", + "parameterSetId": "oauth-login", + "visibilityCondition": "model.auth_type == 'oauth'" + }, + { + "name": "sharepoint_sharepy", + "label": "SharePoint preset", + "type": "PRESET", + "parameterSetId": "sharepoint-login", + "visibilityCondition": "model.auth_type == 'login'" + }, + { + "name": "site_app_permissions", + "label": "Site App preset", + "type": "PRESET", + "parameterSetId": "site-app-permissions", + "visibilityCondition": "model.auth_type == 'site-app-permissions'" + }, + { + "name": "advanced_parameters", + "label": "Show advanced parameters", + "description": "", + "type": "BOOLEAN", + "defaultValue": false + }, + { + "name": "sharepoint_site_overwrite", + "label": "Site path preset overwrite", + "type": "STRING", + "description": "sites/site_name/subsite...", + "visibilityCondition": "model.advanced_parameters == true" + } + ] +} diff --git a/python-connectors/sharepoint-online_documents-metadata/connector.py b/python-connectors/sharepoint-online_documents-metadata/connector.py new file mode 100644 index 0000000..ec61963 --- /dev/null +++ b/python-connectors/sharepoint-online_documents-metadata/connector.py @@ -0,0 +1,48 @@ +from sharepoint_client import SharePointClient +from common import ItemsLimit +from dataiku.connector import Connector + + +class SharePointDocumentsMetadataConnector(Connector): + + def __init__(self, config, plugin_config): + Connector.__init__(self, config, plugin_config) + self.client = SharePointClient(config) + + def get_read_schema(self): + return None + + def generate_rows(self, dataset_schema=None, dataset_partitioning=None, + partition_id=None, records_limit=-1): + limit = ItemsLimit(records_limit) + for row in self.client.get_documents_medatada(): + yield row + if limit.is_reached(): + break + + def get_writer(self, dataset_schema=None, dataset_partitioning=None, + partition_id=None): + raise NotImplementedError + + def get_partitioning(self): + raise NotImplementedError + + def list_partitions(self, partitioning): + return [] + + def partition_exists(self, partitioning, partition_id): + raise NotImplementedError + + def get_records_count(self, partitioning=None, partition_id=None): + raise NotImplementedError + + +class CustomDatasetWriter(object): + def __init__(self): + pass + + def write_row(self, row): + raise NotImplementedError + + def close(self): + pass diff --git a/python-lib/sharepoint_client.py b/python-lib/sharepoint_client.py index 248a079..b5795a0 100644 --- a/python-lib/sharepoint_client.py +++ b/python-lib/sharepoint_client.py @@ -324,6 +324,27 @@ def get_list_items(self, list_title, params=None): self.assert_response_ok(response, calling_method="get_list_items") return response.json().get("ListData", {}) + def get_documents_medatada(self): + headers = DSSConstants.JSON_HEADERS + url = "{}/{}/_vti_bin/listdata.svc/Documents".format(self.sharepoint_origin, self.sharepoint_site) + first = True + while url: + params = None + if first: + params = {"Query": "*"} + first = False + response = self.session.get( + url=url, + headers=headers, + params=params + ) + self.assert_response_ok(response, calling_method="get_documents_medatada") + json_response = response.json() + url = get_value_from_path(json_response, [SharePointConstants.RESULTS_CONTAINER_V2, SharePointConstants.NEXT_PAGE]) + rows = get_value_from_path(json_response, [SharePointConstants.RESULTS_CONTAINER_V2, "results"]) + for row in rows: + yield row + def create_list(self, list_name): headers = DSSConstants.JSON_HEADERS data = { From f04788830703f582deeaf596ec2b1c1df63009bd Mon Sep 17 00:00:00 2001 From: Alexandre Bourret Date: Wed, 17 May 2023 18:08:15 +0200 Subject: [PATCH 2/4] Version++ --- plugin.json | 2 +- .../sharepoint-online_documents-metadata/connector.py | 6 ++++++ python-lib/dss_constants.py | 1 + 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/plugin.json b/plugin.json index 321795d..86d2771 100644 --- a/plugin.json +++ b/plugin.json @@ -1,6 +1,6 @@ { "id": "sharepoint-online", - "version": "1.0.14", + "version": "1.1.0", "meta": { "label": "SharePoint Online", "description": "Read and write data from/to your SharePoint Online account", diff --git a/python-connectors/sharepoint-online_documents-metadata/connector.py b/python-connectors/sharepoint-online_documents-metadata/connector.py index ec61963..c6954f5 100644 --- a/python-connectors/sharepoint-online_documents-metadata/connector.py +++ b/python-connectors/sharepoint-online_documents-metadata/connector.py @@ -1,12 +1,18 @@ from sharepoint_client import SharePointClient from common import ItemsLimit from dataiku.connector import Connector +from safe_logger import SafeLogger +from dss_constants import DSSConstants + + +logger = SafeLogger("sharepoint-online plugin", DSSConstants.SECRET_PARAMETERS_KEYS) class SharePointDocumentsMetadataConnector(Connector): def __init__(self, config, plugin_config): Connector.__init__(self, config, plugin_config) + logger.info('SharePoint Online plugin metadata dataset v{}'.format(DSSConstants.PLUGIN_VERSION)) self.client = SharePointClient(config) def get_read_schema(self): diff --git a/python-lib/dss_constants.py b/python-lib/dss_constants.py index 6a49f44..44396a9 100644 --- a/python-lib/dss_constants.py +++ b/python-lib/dss_constants.py @@ -28,6 +28,7 @@ class DSSConstants(object): "sharepoint_oauth": "The access token is missing" } PATH = 'path' + PLUGIN_VERSION = '1.1.0-beta.1' SECRET_PARAMETERS_KEYS = ["Authorization", "sharepoint_username", "sharepoint_password", "client_secret"] SITE_APP_DETAILS = { "sharepoint_tenant": "The tenant name is missing", From 3fdb7a016d25a0208287f69c0a85043dbeb83c89 Mon Sep 17 00:00:00 2001 From: Alexandre Bourret Date: Wed, 17 May 2023 18:10:31 +0200 Subject: [PATCH 3/4] Update changelog --- CHANGELOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7433f87..936ba21 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,9 @@ # Changelog +## [Version 1.1.0](https://github.com/dataiku/dss-plugin-sharepoint-online/releases/tag/v1.1.0) - Feature release - 2023-05-17 + +- Adding dataset for documents metadata retrieval + ## [Version 1.0.14](https://github.com/dataiku/dss-plugin-sharepoint-online/releases/tag/v1.0.14) - Bugfix release - 2023-04-18 - Updated code-env descriptor for DSS 12 From 80822e0225d34fdb2fc5ba5f5b06d812e292f15f Mon Sep 17 00:00:00 2001 From: Alexandre Bourret Date: Fri, 19 May 2023 16:17:22 +0200 Subject: [PATCH 4/4] Adding a start path --- .../connector.json | 13 +++++++++++++ .../connector.py | 3 ++- python-lib/sharepoint_client.py | 15 +++++++++------ 3 files changed, 24 insertions(+), 7 deletions(-) diff --git a/python-connectors/sharepoint-online_documents-metadata/connector.json b/python-connectors/sharepoint-online_documents-metadata/connector.json index 8d5d380..ea2b4d1 100644 --- a/python-connectors/sharepoint-online_documents-metadata/connector.json +++ b/python-connectors/sharepoint-online_documents-metadata/connector.json @@ -60,6 +60,19 @@ "type": "STRING", "description": "sites/site_name/subsite...", "visibilityCondition": "model.advanced_parameters == true" + }, + { + "name": "sharepoint_root_overwrite", + "label": "Root directory preset overwrite", + "type": "STRING", + "description": "", + "visibilityCondition": "model.advanced_parameters == true" + }, + { + "name": "search_path", + "label": "Path to folder", + "type": "STRING", + "description": "Path to your folder of interest..." } ] } diff --git a/python-connectors/sharepoint-online_documents-metadata/connector.py b/python-connectors/sharepoint-online_documents-metadata/connector.py index c6954f5..b5dccdd 100644 --- a/python-connectors/sharepoint-online_documents-metadata/connector.py +++ b/python-connectors/sharepoint-online_documents-metadata/connector.py @@ -14,6 +14,7 @@ def __init__(self, config, plugin_config): Connector.__init__(self, config, plugin_config) logger.info('SharePoint Online plugin metadata dataset v{}'.format(DSSConstants.PLUGIN_VERSION)) self.client = SharePointClient(config) + self.search_path = config.get("search_path", None) def get_read_schema(self): return None @@ -21,7 +22,7 @@ def get_read_schema(self): def generate_rows(self, dataset_schema=None, dataset_partitioning=None, partition_id=None, records_limit=-1): limit = ItemsLimit(records_limit) - for row in self.client.get_documents_medatada(): + for row in self.client.get_documents_medatada(search_path=self.search_path): yield row if limit.is_reached(): break diff --git a/python-lib/sharepoint_client.py b/python-lib/sharepoint_client.py index b5795a0..26a00e6 100644 --- a/python-lib/sharepoint_client.py +++ b/python-lib/sharepoint_client.py @@ -324,23 +324,26 @@ def get_list_items(self, list_title, params=None): self.assert_response_ok(response, calling_method="get_list_items") return response.json().get("ListData", {}) - def get_documents_medatada(self): + def get_documents_medatada(self, search_path=None): headers = DSSConstants.JSON_HEADERS - url = "{}/{}/_vti_bin/listdata.svc/Documents".format(self.sharepoint_origin, self.sharepoint_site) + next_page_url = "{}/{}/_vti_bin/listdata.svc/Documents".format(self.sharepoint_origin, self.sharepoint_site) first = True - while url: + initial_params = {"Query": "*"} + if search_path: + initial_params.update({"$filter": "Path eq '/{}/{}/{}'".format(self.sharepoint_site, self.sharepoint_root, search_path.strip("/"))}) + while next_page_url: params = None if first: - params = {"Query": "*"} + params = initial_params first = False response = self.session.get( - url=url, + url=next_page_url, headers=headers, params=params ) self.assert_response_ok(response, calling_method="get_documents_medatada") json_response = response.json() - url = get_value_from_path(json_response, [SharePointConstants.RESULTS_CONTAINER_V2, SharePointConstants.NEXT_PAGE]) + next_page_url = get_value_from_path(json_response, [SharePointConstants.RESULTS_CONTAINER_V2, SharePointConstants.NEXT_PAGE]) rows = get_value_from_path(json_response, [SharePointConstants.RESULTS_CONTAINER_V2, "results"]) for row in rows: yield row